From cecab48f24e971599845baa9a408d988742ccb15 Mon Sep 17 00:00:00 2001
From: Jonas Hoersch
Date: Thu, 22 Jan 2026 15:53:21 +0100
Subject: [PATCH 1/5] feat: support data.pypsa.org and its manifests

---
 .../__init__.py    | 205 +++++++++++++-----
 .../monkeypatch.py |  10 +-
 2 files changed, 152 insertions(+), 63 deletions(-)

diff --git a/src/snakemake_storage_plugin_cached_http/__init__.py b/src/snakemake_storage_plugin_cached_http/__init__.py
index 1640c9a..7abf825 100644
--- a/src/snakemake_storage_plugin_cached_http/__init__.py
+++ b/src/snakemake_storage_plugin_cached_http/__init__.py
@@ -5,17 +5,18 @@
 import asyncio
 import hashlib
 import json
-from logging import Logger
 import shutil
 import time
 from contextlib import asynccontextmanager
 from dataclasses import dataclass, field
+from logging import Logger
 from pathlib import Path
-from typing_extensions import override
+from posixpath import basename, dirname, join, normpath, relpath
 from urllib.parse import urlparse
 
 import httpx
 import platformdirs
+import yaml
 from reretry import retry  # pyright: ignore[reportUnknownVariableType]
 from snakemake_interface_common.exceptions import WorkflowError
 from snakemake_interface_common.logging import get_logger
@@ -30,9 +31,10 @@
     StorageQueryValidationResult,
 )
 from tqdm_loggable.auto import tqdm
+from typing_extensions import override
 
 from .cache import Cache
-from .monkeypatch import is_zenodo_url  # noqa: F401 - applies monkeypatch on import
+from .monkeypatch import is_pypsa_or_zenodo_url
 
 logger = get_logger()
@@ -90,11 +92,12 @@ class StorageProviderSettings(SettingsBase):
 
 @dataclass
-class ZenodoFileMetadata:
-    """Metadata for a file in a Zenodo record."""
+class FileMetadata:
+    """Metadata for a file in a Zenodo or data.pypsa.org record."""
 
     checksum: str | None
     size: int
+    redirect: str | None = None  # used to indicate data.pypsa.org redirection
 
 
 class WrongChecksum(Exception):
@@ -140,7 +143,8 @@ def __post_init__(self):
         self._client_refcount: int = 0
 
         # Cache for record metadata to avoid repeated API calls
-        self._record_cache: dict[str, dict[str, ZenodoFileMetadata]] = {}
+        self._zenodo_record_cache: dict[str, dict[str, FileMetadata]] = {}
+        self._pypsa_manifest_cache: dict[str, dict[str, FileMetadata]] = {}
 
     @override
     def use_rate_limiter(self) -> bool:
@@ -162,16 +166,21 @@ def example_queries(cls) -> list[ExampleQuery]:
         return [
             ExampleQuery(
                 query="https://zenodo.org/records/17249457/files/ARDECO-SNPTD.2021.table.csv",
-                description="A Zenodo file URL (currently the only supported HTTP source)",
+                description="A Zenodo file URL",
                 type=QueryType.INPUT,
-            )
+            ),
+            ExampleQuery(
+                query="https://data.pypsa.org/workflows/eur/eez/v12_20231025/World_EEZ_v12_20231025_LR.zip",
+                description="A data.pypsa.org file URL",
+                type=QueryType.INPUT,
+            ),
         ]
 
     @override
     @classmethod
     def is_valid_query(cls, query: str) -> StorageQueryValidationResult:
         """Only handle zenodo.org URLs"""
-        if is_zenodo_url(query):
+        if is_pypsa_or_zenodo_url(query):
             return StorageQueryValidationResult(query=query, valid=True)
 
         return StorageQueryValidationResult(
@@ -265,22 +274,50 @@ async def httpr(self, method: str, url: str):
             raise
 
     @retry_decorator
-    async def get_metadata(
-        self, record_id: str, netloc: str
-    ) -> dict[str, ZenodoFileMetadata]:
+    async def get_metadata(self, path: str, netloc: str) -> FileMetadata | None:
         """
-        Retrieve and cache file metadata for a Zenodo record.
+        Retrieve and cache file metadata for a Zenodo record or a data.pypsa.org file.
 
         Args:
-            record_id: The Zenodo record ID
+            path: Server path
+            netloc: Network location (e.g., "zenodo.org")
+
+        Returns:
+            FileMetadata for the requested file, or None if not found
+        """
+        if netloc in ("zenodo.org", "sandbox.zenodo.org"):
+            return await self.get_zenodo_metadata(path, netloc)
+        elif netloc in "data.pypsa.org":
+            return await self.get_pypsa_metadata(path, netloc)
+
+        raise WorkflowError(
+            "Cached-http storage plugin is only implemented for zenodo.org and data.pypsa.org URLs"
+        )
+
+    async def get_zenodo_metadata(self, path: str, netloc: str) -> FileMetadata | None:
+        """
+        Retrieve and cache file metadata for a Zenodo record.
+
+        Args:
+            path: Server path
             netloc: Network location (e.g., "zenodo.org")
 
         Returns:
-            Dictionary mapping filename to ZenodoFileMetadata
+            FileMetadata for the requested file, or None if not found
         """
+
+        # Zenodo record
+        _records, record_id, _files, filename = path.split("/", maxsplit=3)
+
+        if _records != "records" or _files != "files":
+            raise WorkflowError(
+                f"Invalid Zenodo URL format: http(s)://{netloc}/{path}. "
+                f"Expected format: https://zenodo.org/records/{{record_id}}/files/{{filename}}"
+            )
+
         # Check cache first
-        if record_id in self._record_cache:
-            return self._record_cache[record_id]
+        if record_id in self._zenodo_record_cache:
+            return self._zenodo_record_cache[record_id].get(filename)
 
         # Fetch from API
         api_url = f"https://{netloc}/api/records/{record_id}"
@@ -296,30 +333,87 @@
             data = json.loads(content)
 
         # Parse files array and build metadata dict
-        metadata: dict[str, ZenodoFileMetadata] = {}
+        metadata: dict[str, FileMetadata] = {}
         files = data.get("files", [])
         for file_info in files:
-            filename: str | None = file_info.get("key")
+            fn: str | None = file_info.get("key")
            checksum: str | None = file_info.get("checksum")
             size: int = file_info.get("size", 0)
 
-            if not filename:
+            if not fn:
                 continue
 
-            metadata[filename] = ZenodoFileMetadata(checksum=checksum, size=size)
+            metadata[fn] = FileMetadata(checksum=checksum, size=size)
+
+        # Store in cache
+        self._zenodo_record_cache[record_id] = metadata
+
+        return metadata.get(filename)
+
+    async def get_pypsa_metadata(self, path: str, netloc: str) -> FileMetadata | None:
+        """
+        Retrieve and cache file metadata from data.pypsa.org manifest.
+
+        Args:
+            record_id: The Zenodo record ID
+            netloc: Network location (e.g., "data.pypsa.org")
+
+        Returns:
+            Dictionary mapping filename to FileMetadata
+        """
+
+        # Check cache first
+        base_path = dirname(path)
+        while base_path:
+            if base_path in self._pypsa_manifest_cache:
+                filename = relpath(path, base_path)
+                return self._pypsa_manifest_cache[base_path].get(filename)
+            base_path = dirname(base_path)
+
+        # Fetch manifest
+        base_path = dirname(path)
+        while base_path:
+            manifest_url = f"https://{netloc}/{base_path}/manifest.yaml"
+
+            async with self.httpr("get", manifest_url) as response:
+                if response.status_code == 200:
+                    content = await response.aread()
+                    data = yaml.safe_load(content)
+                    break
+
+            base_path = dirname(base_path)
+        else:
+            raise WorkflowError(
+                f"Failed to fetch data.pypsa.org manifest for https://{netloc}/{path}"
+            )
+
+        # Parse files array and build metadata dict
+        metadata: dict[str, FileMetadata] = {}
+        files = data.get("files", {})
+        for filename, file_info in files.items():
+            redirect: str | None = file_info.get("redirect")
+            checksum: str | None = file_info.get("checksum")
+            size: int = file_info.get("size", 0)
+
+            if redirect is not None:
+                redirect = normpath(join(base_path, redirect))
+
+            metadata[filename] = FileMetadata(
+                checksum=checksum, size=size, redirect=redirect
+            )
 
         # Store in cache
-        self._record_cache[record_id] = metadata
+        self._pypsa_manifest_cache[base_path] = metadata
 
-        return metadata
+        filename = relpath(path, base_path)
+        return metadata.get(filename)
 
 
 # Implementation of storage object
 class StorageObject(StorageObjectRead):
     provider: StorageProvider  # pyright: ignore[reportIncompatibleVariableOverride]
-    record_id: str
-    filename: str
     netloc: str
+    path: str
 
     def __post_init__(self):
         super().__post_init__()
 
         # Parse URL to extract record ID and filename
         # URL format: https://zenodo.org/records/{record_id}/files/{filename}
         parsed = urlparse(str(self.query))
-        _records, record_id, _files, filename = parsed.path.strip("/").split(
-            "/", maxsplit=3
-        )
-
-        if _records != "records" or _files != "files":
-            raise WorkflowError(
-                f"Invalid Zenodo URL format: {self.query}. "
" - f"Expected format: https://zenodo.org/records/{{record_id}}/files/{{filename}}" - ) - - self.record_id = record_id - self.filename = filename self.netloc = parsed.netloc + self.path = parsed.path.strip("/") @override def local_suffix(self) -> str: """Return the local suffix for this object (used by parent class).""" - parsed = urlparse(str(self.query)) - return f"{parsed.netloc}{parsed.path}" + return f"{self.netloc}{self.path}" @override def get_inventory_parent(self) -> str | None: @@ -363,8 +445,8 @@ async def managed_exists(self) -> bool: if cached is not None: return True - metadata = await self.provider.get_metadata(self.record_id, self.netloc) - return self.filename in metadata + metadata = await self.provider.get_metadata(self.path, self.netloc) + return metadata is not None @override async def managed_mtime(self) -> float: @@ -380,8 +462,8 @@ async def managed_size(self) -> int: if cached is not None: return cached.stat().st_size - metadata = await self.provider.get_metadata(self.record_id, self.netloc) - return metadata[self.filename].size if self.filename in metadata else 0 + metadata = await self.provider.get_metadata(self.path, self.netloc) + return metadata.size if metadata is not None else 0 @override async def inventory(self, cache: IOCacheStorageInterface) -> None: @@ -408,11 +490,11 @@ async def inventory(self, cache: IOCacheStorageInterface) -> None: cache.size[key] = cached.stat().st_size return - metadata = await self.provider.get_metadata(self.record_id, self.netloc) - exists = self.filename in metadata + metadata = await self.provider.get_metadata(self.path, self.netloc) + exists = metadata is not None cache.exists_in_storage[key] = exists cache.mtime[key] = Mtime(storage=0) - cache.size[key] = metadata[self.filename].size if exists else 0 + cache.size[key] = metadata.size if exists else 0 @override def cleanup(self): @@ -443,13 +525,11 @@ async def verify_checksum(self, path: Path) -> None: WrongChecksum """ # Get cached or fetch record metadata - metadata = await self.provider.get_metadata(self.record_id, self.netloc) - if self.filename not in metadata: - raise WorkflowError( - f"File {self.filename} not found in Zenodo record {self.record_id}" - ) + metadata = await self.provider.get_metadata(self.path, self.netloc) + if metadata is None: + raise WorkflowError(f"No metadata found for {self.query}") - checksum = metadata[self.filename].checksum + checksum = metadata.checksum if checksum is None: return @@ -472,23 +552,28 @@ async def managed_retrieve(self): local_path = self.local_path() local_path.parent.mkdir(parents=True, exist_ok=True) + query = str(self.query) + filename = basename(self.path) + + metadata = self.provider.get_metadata(self.path, self.netloc) + if metadata.redirect is not None: + query = f"https://{self.netloc}/{metadata.redirect}" + # If already in cache, just copy if self.provider.cache: cached = self.provider.cache.get(str(self.query)) if cached is not None: - logger.info( - f"Retrieved {self.filename} of zenodo record {self.record_id} from cache" - ) + logger.info(f"Retrieved {filename} from cache ({query})") shutil.copy2(cached, local_path) return try: - # Download from Zenodo using a get request, rate limit errors are detected and + # Download from Zenodo or data.pypsa.org using a get request, rate limit errors are detected and # raise WorkflowError to trigger a retry - async with self.provider.httpr("get", str(self.query)) as response: + async with self.provider.httpr("get", query) as response: if response.status_code != 200: raise 
-                        f"Failed to download from Zenodo: HTTP {response.status_code} ({self.query})"
+                        f"Failed to download: HTTP {response.status_code} ({query})"
                     )
 
                 total_size = int(response.headers.get("content-length", 0))
@@ -499,7 +584,7 @@
                     total=total_size,
                     unit="B",
                     unit_scale=True,
-                    desc=self.filename,
+                    desc=filename,
                     position=None,
                     leave=True,
                 ) as pbar:
@@ -511,7 +596,7 @@
 
             # Copy to cache after successful verification
             if self.provider.cache:
-                self.provider.cache.put(str(self.query), local_path)
+                self.provider.cache.put(query, local_path)
 
         except:
             if local_path.exists():
diff --git a/src/snakemake_storage_plugin_cached_http/monkeypatch.py b/src/snakemake_storage_plugin_cached_http/monkeypatch.py
index dc43c32..2036624 100644
--- a/src/snakemake_storage_plugin_cached_http/monkeypatch.py
+++ b/src/snakemake_storage_plugin_cached_http/monkeypatch.py
@@ -17,9 +17,13 @@
 )
 
 
-def is_zenodo_url(url: str) -> bool:
+def is_pypsa_or_zenodo_url(url: str) -> bool:
     parsed = urlparse(url)
-    return parsed.netloc in ("zenodo.org", "sandbox.zenodo.org") and parsed.scheme in (
+    return parsed.netloc in (
+        "zenodo.org",
+        "sandbox.zenodo.org",
+        "data.pypsa.org",
+    ) and parsed.scheme in (
         "http",
         "https",
     )
@@ -34,7 +38,7 @@ def is_zenodo_url(url: str) -> bool:
                 valid=False,
                 reason="Deactivated in favour of cached_http",
             )
-            if is_zenodo_url(q)
+            if is_pypsa_or_zenodo_url(q)
             else orig_valid_query(q)
         )
     )

From d49b2026da410ea995ee2e7652707ac48fd5538f Mon Sep 17 00:00:00 2001
From: Jonas Hoersch
Date: Thu, 22 Jan 2026 16:07:33 +0100
Subject: [PATCH 2/5] fix: bug and doc fixes

---
 .../__init__.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/snakemake_storage_plugin_cached_http/__init__.py b/src/snakemake_storage_plugin_cached_http/__init__.py
index 7abf825..b4dd009 100644
--- a/src/snakemake_storage_plugin_cached_http/__init__.py
+++ b/src/snakemake_storage_plugin_cached_http/__init__.py
@@ -2,7 +2,6 @@
 #
 # SPDX-License-Identifier: MIT
 
-import asyncio
 import hashlib
 import json
 import shutil
@@ -186,7 +185,7 @@ def is_valid_query(cls, query: str) -> StorageQueryValidationResult:
         return StorageQueryValidationResult(
             query=query,
             valid=False,
-            reason="Not a Zenodo URL (only zenodo.org URLs are handled by this plugin)",
+            reason="Only zenodo.org and data.pypsa.org URLs are handled by this plugin",
         )
 
     @override
@@ -287,7 +286,7 @@ async def get_metadata(self, path: str, netloc: str) -> FileMetadata | None:
         """
         if netloc in ("zenodo.org", "sandbox.zenodo.org"):
             return await self.get_zenodo_metadata(path, netloc)
-        elif netloc in "data.pypsa.org":
+        elif netloc == "data.pypsa.org":
             return await self.get_pypsa_metadata(path, netloc)
 
         raise WorkflowError(
@@ -355,11 +354,11 @@ async def get_pypsa_metadata(self, path: str, netloc: str) -> FileMetadata | Non
         """
         Retrieve and cache file metadata from data.pypsa.org manifest.
 
         Args:
-            record_id: The Zenodo record ID
+            path: Server path
             netloc: Network location (e.g., "data.pypsa.org")
 
         Returns:
-            Dictionary mapping filename to FileMetadata
+            FileMetadata for the requested file, or None if not found
         """
 
         # Check cache first
@@ -555,13 +554,13 @@ async def managed_retrieve(self):
         query = str(self.query)
         filename = basename(self.path)
 
-        metadata = self.provider.get_metadata(self.path, self.netloc)
-        if metadata.redirect is not None:
+        metadata = await self.provider.get_metadata(self.path, self.netloc)
+        if metadata is not None and metadata.redirect is not None:
             query = f"https://{self.netloc}/{metadata.redirect}"
 
         # If already in cache, just copy
         if self.provider.cache:
-            cached = self.provider.cache.get(str(self.query))
+            cached = self.provider.cache.get(query)
             if cached is not None:
                 logger.info(f"Retrieved {filename} from cache ({query})")
                 shutil.copy2(cached, local_path)
                 return

From b4dbc4cee3552286e5512548bebcb70f7ca25211 Mon Sep 17 00:00:00 2001
From: Jonas Hoersch
Date: Thu, 22 Jan 2026 16:17:47 +0100
Subject: [PATCH 3/5] fix: update tests

---
 tests/test_download.py | 85 +++++++++++++++++++++++++++---------------
 tests/test_import.py   | 15 ++++++--
 2 files changed, 66 insertions(+), 34 deletions(-)

diff --git a/tests/test_download.py b/tests/test_download.py
index 704d62c..b0abe5a 100644
--- a/tests/test_download.py
+++ b/tests/test_download.py
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: MIT
 
-"""Functional tests for downloading and checksumming from Zenodo."""
+"""Functional tests for downloading and checksumming from Zenodo and data.pypsa.org."""
 
 import json
 import logging
@@ -16,7 +16,21 @@
     WrongChecksum,
 )
 
-TEST_URL = "https://zenodo.org/records/16810901/files/attributed_ports.json"
+# Test URLs and their metadata paths
+TEST_CONFIGS = {
+    "zenodo": {
+        "url": "https://zenodo.org/records/16810901/files/attributed_ports.json",
+        "path": "records/16810901/files/attributed_ports.json",
+        "netloc": "zenodo.org",
+        "has_size": True,
+    },
+    "pypsa": {
+        "url": "https://data.pypsa.org/workflows/eur/attributed_ports/2020-07-10/attributed_ports.json",
+        "path": "workflows/eur/attributed_ports/2020-07-10/attributed_ports.json",
+        "netloc": "data.pypsa.org",
+        "has_size": False,  # data.pypsa.org manifests don't include size
+    },
+}
 
 
 @pytest.fixture
@@ -44,33 +58,36 @@ def storage_provider(tmp_path):
     return provider
 
 
+@pytest.fixture(params=["zenodo", "pypsa"])
+def test_config(request):
+    """Provide test configuration (parametrized for zenodo and pypsa)."""
+    return TEST_CONFIGS[request.param]
+
+
 @pytest.fixture
-def storage_object(storage_provider):
-    """Create a StorageObject for the test file."""
-    # Create storage object
+def storage_object(test_config, storage_provider):
+    """Create a StorageObject for the test file (parametrized for zenodo and pypsa)."""
     obj = StorageObject(
-        query=TEST_URL,
+        query=test_config["url"],
         keep_local=False,
         retrieve=True,
         provider=storage_provider,
     )
-
     yield obj
 
 
 @pytest.mark.asyncio
-async def test_zenodo_metadata_fetch(storage_provider):
-    """Test that we can fetch metadata from Zenodo API."""
-    record_id = "16810901"
-    netloc = "zenodo.org"
-
-    metadata = await storage_provider.get_metadata(record_id, netloc)
+async def test_metadata_fetch(storage_provider, test_config):
+    """Test that we can fetch metadata from the API/manifest."""
+    metadata = await storage_provider.get_metadata(
+        test_config["path"], test_config["netloc"]
+    )
 
-    assert "attributed_ports.json" in metadata
-    file_meta = metadata["attributed_ports.json"]
-    assert file_meta.checksum is not None
-    assert file_meta.size > 0
-    assert file_meta.checksum.startswith("md5:")
+    assert metadata is not None
+    assert metadata.checksum is not None
+    assert metadata.checksum.startswith("md5:")
+    if test_config["has_size"]:
+        assert metadata.size > 0
 
 
 @pytest.mark.asyncio
 async def test_storage_object_exists(storage_object):
     """Test that the storage object exists."""
     exists = await storage_object.managed_exists()
     assert exists is True
 
 
 @pytest.mark.asyncio
-async def test_storage_object_size(storage_object):
+async def test_storage_object_size(storage_object, test_config):
     """Test that the storage object reports size correctly."""
     size = await storage_object.managed_size()
-    assert size > 0
-    # The file is a small JSON file, should be less than 1MB
-    assert size < 1_000_000
+    if test_config["has_size"]:
+        assert size > 0
+        # The file is a small JSON file, should be less than 1MB
+        assert size < 1_000_000
+    else:
+        # data.pypsa.org manifests don't include size
+        assert size == 0
 
 
 @pytest.mark.asyncio
 async def test_storage_object_mtime(storage_object):
-    """Test that mtime is 0 for immutable Zenodo URLs."""
+    """Test that mtime is 0 for immutable URLs."""
     mtime = await storage_object.managed_mtime()
     assert mtime == 0
@@ -112,21 +133,23 @@ async def test_download_and_checksum(storage_object, tmp_path):
     assert local_path.exists()
     assert local_path.stat().st_size > 0
 
-    # Verify it's valid JSON (use utf-8 with error handling for any encoding issues)
+    # Verify it's valid JSON
     with open(local_path, encoding="utf-8", errors="replace") as f:
         data = json.load(f)
-        assert isinstance(data, dict)
+        assert isinstance(data, (dict, list))
 
     # Verify checksum (should not raise WrongChecksum exception)
     await storage_object.verify_checksum(local_path)
 
 
 @pytest.mark.asyncio
-async def test_cache_functionality(storage_provider, tmp_path):
+async def test_cache_functionality(storage_provider, test_config, tmp_path):
     """Test that files are cached after download."""
+    url = test_config["url"]
+
     # First download
     obj1 = StorageObject(
-        query=TEST_URL,
+        query=url,
         keep_local=False,
         retrieve=True,
         provider=storage_provider,
@@ -140,13 +163,13 @@ async def test_cache_functionality(storage_provider, tmp_path):
 
     # Verify cache was populated
     assert obj1.provider.cache is not None
-    cached_path = obj1.provider.cache.get(str(obj1.query))
+    cached_path = obj1.provider.cache.get(url)
     assert cached_path is not None
     assert cached_path.exists()
 
     # Second download should use cache
     obj2 = StorageObject(
-        query=TEST_URL,
+        query=url,
         keep_local=False,
         retrieve=True,
         provider=storage_provider,
@@ -163,7 +186,7 @@ async def test_cache_functionality(storage_provider, tmp_path):
 
 
 @pytest.mark.asyncio
-async def test_skip_remote_checks(tmp_path):
+async def test_skip_remote_checks(test_config, tmp_path):
     """Test that skip_remote_checks works correctly."""
     local_prefix = tmp_path / "local"
     local_prefix.mkdir()
@@ -183,7 +206,7 @@ async def test_skip_remote_checks(tmp_path):
     )
 
     obj = StorageObject(
-        query=TEST_URL,
+        query=test_config["url"],
         keep_local=False,
         retrieve=True,
         provider=provider_skip,
diff --git a/tests/test_import.py b/tests/test_import.py
index dfcd378..194322c 100644
--- a/tests/test_import.py
+++ b/tests/test_import.py
@@ -55,14 +55,23 @@ def test_is_valid_query_zenodo():
     assert result.valid is True
 
 
-def test_is_valid_query_non_zenodo():
+def test_is_valid_query_pypsa():
+    """Test that is_valid_query accepts data.pypsa.org URLs."""
+    from snakemake_storage_plugin_cached_http import StorageProvider
+
+    result = StorageProvider.is_valid_query(
"https://data.pypsa.org/workflows/eur/eez/v12_20231025/World_EEZ_v12_20231025_LR.zip" + ) + assert result.valid is True + + +def test_is_valid_query_non_zenodo_or_pypsa(): """Test that is_valid_query rejects non-zenodo URLs.""" from snakemake_storage_plugin_cached_http import StorageProvider - # Non-Zenodo URL should be rejected + # Non-Zenodo/PyPSA URL should be rejected result = StorageProvider.is_valid_query("https://example.com/file.txt") assert result.valid is False - assert "zenodo" in result.reason.lower() def test_example_queries(): From ec90293b6a88376f63c9adb06e49e69bf168ea30 Mon Sep 17 00:00:00 2001 From: Jonas Hoersch Date: Thu, 22 Jan 2026 16:22:47 +0100 Subject: [PATCH 4/5] Update README.md --- README.md | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 9076e5e..3ee9120 100644 --- a/README.md +++ b/README.md @@ -7,12 +7,14 @@ SPDX-License-Identifier: CC-BY-4.0 A Snakemake storage plugin for downloading files via HTTP with local caching, checksum verification, and adaptive rate limiting. -**Note:** This plugin is currently specifically designed for zenodo.org URLs. +**Supported sources:** +- **zenodo.org** - Zenodo data repository (checksum from API) +- **data.pypsa.org** - PyPSA data repository (checksum from manifest.yaml) ## Features - **Local caching**: Downloads are cached to avoid redundant transfers (can be disabled) -- **Checksum verification**: Automatically verifies MD5 checksums from Zenodo API +- **Checksum verification**: Automatically verifies checksums (from Zenodo API or data.pypsa.org manifests) - **Rate limit handling**: Automatically respects Zenodo's rate limits using `X-RateLimit-*` headers with exponential backoff retry - **Concurrent download control**: Limits simultaneous downloads to prevent overwhelming Zenodo - **Progress bars**: Shows download progress with tqdm @@ -64,16 +66,24 @@ If you don't explicitly configure it, the plugin will use default settings autom ## Usage -Use Zenodo URLs directly in your rules. Snakemake automatically detects zenodo.org URLs and routes them to this plugin: +Use Zenodo or data.pypsa.org URLs directly in your rules. Snakemake automatically detects supported URLs and routes them to this plugin: ```python -rule download_data: +rule download_zenodo: input: storage("https://zenodo.org/records/3520874/files/natura.tiff"), output: "resources/natura.tiff" shell: "cp {input} {output}" + +rule download_pypsa: + input: + storage("https://data.pypsa.org/workflows/eur/eez/v12_20231025/World_EEZ_v12_20231025_LR.zip"), + output: + "resources/eez.zip" + shell: + "cp {input} {output}" ``` Or if you configured a tagged storage entity: @@ -93,11 +103,11 @@ rule download_data: The plugin will: 1. Check if the file exists in the cache (if caching is enabled) 2. If cached, copy from cache (fast) -3. If not cached, download from Zenodo with: +3. If not cached, download with: - Progress bar showing download status - Automatic rate limit handling with exponential backoff retry - Concurrent download limiting - - MD5 checksum verification against Zenodo API metadata + - Checksum verification (from Zenodo API or data.pypsa.org manifest) 4. 
 4. Store in cache for future use (if caching is enabled)
 
 ### Example: CI/CD Configuration
@@ -129,19 +139,19 @@ The plugin automatically:
 
 ## URL Handling
 
-- Only handles URLs containing `zenodo.org`
+- Handles URLs from `zenodo.org`, `sandbox.zenodo.org`, and `data.pypsa.org`
 - Other HTTP(S) URLs are handled by the standard `snakemake-storage-plugin-http`
 - Both plugins can coexist in the same workflow
 
 ### Plugin Priority
 
 When using `storage()` without specifying a plugin name, Snakemake checks all installed plugins:
-- **Cached HTTP plugin**: Only accepts zenodo.org URLs (`is_valid_query` returns True only for zenodo.org)
+- **Cached HTTP plugin**: Only accepts zenodo.org and data.pypsa.org URLs
 - **HTTP plugin**: Accepts all HTTP/HTTPS URLs (including zenodo.org)
 
-If both plugins are installed, zenodo.org URLs are ambiguous - both plugins accept them.
-Typically snakemake would raise an error: **"Multiple suitable storage providers found"** if you try to use `storage()` without specifying which plugin to use, ie. one needs to explicitly call the Cached HTTP provider for zenodo.org URLs using `storage.cached_http(url)` instead of `storage(url)`,
-but we monkey-patch the http plugin to refuse zenodo.org urls.
+If both plugins are installed, supported URLs would be ambiguous - both plugins accept them.
+Typically, Snakemake would raise a **"Multiple suitable storage providers found"** error if you used `storage()` without specifying which plugin, i.e. one would need to call the Cached HTTP provider explicitly via `storage.cached_http(url)` instead of `storage(url)`,
+but we monkey-patch the http plugin to refuse zenodo.org and data.pypsa.org URLs.
 
 ## License

From 57562d3930f2b964b02edae2a8d4a81a70ba08fd Mon Sep 17 00:00:00 2001
From: Jonas Hoersch
Date: Thu, 22 Jan 2026 17:04:52 +0100
Subject: [PATCH 5/5] fix: improve caching test

---
 tests/conftest.py      | 29 +++++++++++++++++++++++++++++
 tests/test_download.py |  6 ++++--
 2 files changed, 33 insertions(+), 2 deletions(-)
 create mode 100644 tests/conftest.py

diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..78f7805
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,29 @@
+# SPDX-FileCopyrightText: Contributors to PyPSA-Eur
+#
+# SPDX-License-Identifier: MIT
+
+"""Pytest configuration and shared fixtures."""
+
+from contextlib import contextmanager
+
+
+@contextmanager
+def assert_no_http_requests(provider):
+    """
+    Context manager that fails if any HTTP requests are made.
+
+    Usage:
+        with assert_no_http_requests(storage_provider):
+            await obj.managed_retrieve()  # Should use cache, not HTTP
+    """
+    original_httpr = provider.httpr
+
+    async def httpr_should_not_be_called(*args, **kwargs):
+        raise AssertionError("HTTP request made when none was expected")
+
+    provider.httpr = httpr_should_not_be_called
+
+    try:
+        yield
+    finally:
+        provider.httpr = original_httpr
diff --git a/tests/test_download.py b/tests/test_download.py
index b0abe5a..6690542 100644
--- a/tests/test_download.py
+++ b/tests/test_download.py
@@ -15,6 +15,7 @@
     StorageProviderSettings,
     WrongChecksum,
 )
+from tests.conftest import assert_no_http_requests
 
 # Test URLs and their metadata paths
 TEST_CONFIGS = {
@@ -167,7 +168,7 @@ async def test_cache_functionality(storage_provider, test_config, tmp_path):
     assert cached_path is not None
     assert cached_path.exists()
 
-    # Second download should use cache
+    # Second download should use cache - verify by checking no HTTP requests are made
     obj2 = StorageObject(
@@ -179,7 +180,8 @@ async def test_cache_functionality(storage_provider, test_config, tmp_path):
     local_path2.parent.mkdir(parents=True, exist_ok=True)
     obj2.local_path = lambda: local_path2
 
-    await obj2.managed_retrieve()
+    with assert_no_http_requests(storage_provider):
+        await obj2.managed_retrieve()
 
     # Both files should be identical
     assert local_path1.read_bytes() == local_path2.read_bytes()
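
For reference, the series parses a `manifest.yaml` but never shows one. `get_pypsa_metadata` walks up the URL path, requesting `manifest.yaml` from each ancestor directory until one answers with HTTP 200, then looks the file up in a top-level `files` mapping keyed by the path relative to that manifest's directory. A minimal sketch of the lookup order, with a hypothetical helper name and file URL (this function is illustrative, not part of the plugin):

```python
from posixpath import dirname


def candidate_manifests(netloc: str, path: str) -> list[str]:
    """Manifest URLs tried by get_pypsa_metadata, nearest directory first."""
    urls = []
    base_path = dirname(path)
    while base_path:
        urls.append(f"https://{netloc}/{base_path}/manifest.yaml")
        base_path = dirname(base_path)
    return urls


# candidate_manifests("data.pypsa.org", "workflows/eur/eez/v12/file.zip") yields:
#   https://data.pypsa.org/workflows/eur/eez/v12/manifest.yaml
#   https://data.pypsa.org/workflows/eur/eez/manifest.yaml
#   https://data.pypsa.org/workflows/eur/manifest.yaml
#   https://data.pypsa.org/workflows/manifest.yaml
# get_pypsa_metadata raises WorkflowError if none of them can be fetched.
```

A manifest consistent with that parsing logic maps each relative file path to an entry with the keys the plugin reads (`checksum`, `size`, `redirect`). All names and values below are illustrative, not taken from data.pypsa.org:

```yaml
files:
  v12_20231025/World_EEZ_v12_20231025_LR.zip:
    # the md5 prefix matches what test_metadata_fetch asserts for both sources
    checksum: "md5:0123456789abcdef0123456789abcdef"
    # optional: downloaded instead of the original URL, resolved relative to
    # the manifest's directory via normpath(join(base_path, redirect))
    redirect: ../mirror/World_EEZ_v12_20231025_LR.zip
    # size may be omitted; managed_size() then reports 0
```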