From 1e4daed41cde417bf7c031f4e75fab66e1120964 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Thu, 4 Dec 2025 12:57:21 +0100 Subject: [PATCH 1/5] Add xcube support --- environment.yml | 2 + esmvalcore/cmor/table.py | 3 + esmvalcore/config-developer.yml | 2 + .../configurations/data-xcube-ccizarr.yml | 8 + esmvalcore/dataset.py | 60 ++--- esmvalcore/io/xcube.py | 210 ++++++++++++++++++ pyproject.toml | 2 +- 7 files changed, 258 insertions(+), 29 deletions(-) create mode 100644 esmvalcore/config/configurations/data-xcube-ccizarr.yml create mode 100644 esmvalcore/io/xcube.py diff --git a/environment.yml b/environment.yml index 659cf4918d..4f74d6df05 100644 --- a/environment.yml +++ b/environment.yml @@ -48,6 +48,8 @@ dependencies: - scipy >=1.6 - shapely >=2.0.0 - xarray + - xcube + - xcube-cci - yamale - zarr >3 # Python packages needed for building docs diff --git a/esmvalcore/cmor/table.py b/esmvalcore/cmor/table.py index abd42af045..255c219e47 100644 --- a/esmvalcore/cmor/table.py +++ b/esmvalcore/cmor/table.py @@ -40,6 +40,9 @@ def _update_cmor_facets(facets): """Update `facets` with information from CMOR table.""" project = facets["project"] + if project == "external": + facets["original_short_name"] = facets["short_name"] + return mip = facets["mip"] short_name = facets["short_name"] derive = facets.get("derive", False) diff --git a/esmvalcore/config-developer.yml b/esmvalcore/config-developer.yml index 32691ab504..4ace8ccc19 100644 --- a/esmvalcore/config-developer.yml +++ b/esmvalcore/config-developer.yml @@ -26,6 +26,8 @@ # cmor_path: ~/my/own/custom_tables ############################################################################### --- +external: + output_file: "{dataset}_{short_name}" CMIP6: cmor_strict: true diff --git a/esmvalcore/config/configurations/data-xcube-ccizarr.yml b/esmvalcore/config/configurations/data-xcube-ccizarr.yml new file mode 100644 index 0000000000..e53e74fc99 --- /dev/null +++ b/esmvalcore/config/configurations/data-xcube-ccizarr.yml @@ -0,0 +1,8 @@ +# Read ESA Climate Data Centre (ESA CCI) using xcube. +# https://xcube.readthedocs.io/en/latest/dataaccess.html#esa-climate-data-centre-esa-cci-cciodp-ccizarr-esa-cci-kc +projects: + external: + data: + ccizarr: + type: "esmvalcore.io.xcube.XCubeDataSource" + data_store_id: "ccizarr" diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 6be9687a15..53848e4f0a 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -840,40 +840,44 @@ def _load(self) -> Cube: ) settings: dict[str, dict[str, Any]] = {} - settings["fix_file"] = { - "output_dir": fix_dir_prefix, - "add_unique_suffix": True, - "session": self.session, - **self.facets, - } + if self.facets["project"] != "external": + settings["fix_file"] = { + "output_dir": fix_dir_prefix, + "add_unique_suffix": True, + "session": self.session, + **self.facets, + } settings["load"] = {} - settings["fix_metadata"] = { - "session": self.session, - **self.facets, - } + if self.facets["project"] != "external": + settings["fix_metadata"] = { + "session": self.session, + **self.facets, + } settings["concatenate"] = {"check_level": self.session["check_level"]} - settings["cmor_check_metadata"] = { - "check_level": self.session["check_level"], - "cmor_table": self.facets["project"], - "mip": self.facets["mip"], - "frequency": self.facets["frequency"], - "short_name": self.facets["short_name"], - } + if self.facets["project"] != "external": + settings["cmor_check_metadata"] = { + "check_level": self.session["check_level"], + "cmor_table": self.facets["project"], + "mip": self.facets["mip"], + "frequency": self.facets["frequency"], + "short_name": self.facets["short_name"], + } if "timerange" in self.facets: settings["clip_timerange"] = { "timerange": self.facets["timerange"], } - settings["fix_data"] = { - "session": self.session, - **self.facets, - } - settings["cmor_check_data"] = { - "check_level": self.session["check_level"], - "cmor_table": self.facets["project"], - "mip": self.facets["mip"], - "frequency": self.facets["frequency"], - "short_name": self.facets["short_name"], - } + if self.facets["project"] != "external": + settings["fix_data"] = { + "session": self.session, + **self.facets, + } + settings["cmor_check_data"] = { + "check_level": self.session["check_level"], + "cmor_table": self.facets["project"], + "mip": self.facets["mip"], + "frequency": self.facets["frequency"], + "short_name": self.facets["short_name"], + } result: Sequence[PreprocessorItem] = self.files for step, kwargs in settings.items(): diff --git a/esmvalcore/io/xcube.py b/esmvalcore/io/xcube.py new file mode 100644 index 0000000000..abb82888d3 --- /dev/null +++ b/esmvalcore/io/xcube.py @@ -0,0 +1,210 @@ +"""Load data using ``xcube``.""" + +from __future__ import annotations + +import copy +import fnmatch +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Any + +import iris.cube +import iris.std_names +import xcube.core.store + +import esmvalcore.io.protocol +from esmvalcore.iris_helpers import dataset_to_iris + +if TYPE_CHECKING: + from esmvalcore.typing import Facets, FacetValue + + +@dataclass +class XCubeDataset(esmvalcore.io.protocol.DataElement): + """A dataset that can be used to load data found using intake-esgf_.""" + + name: str + """A unique name identifying the data.""" + + facets: Facets = field(repr=False) + """Facets are key-value pairs that were used to find this data.""" + + store: xcube.core.store.store.DataStore = field(repr=False) + """The store containing the data.""" + + open_params: dict[str, Any] = field(default_factory=dict, repr=False) + """Parameters to use when opening the data.""" + + _attributes: dict[str, Any] | None = field( + init=False, + repr=False, + default=None, + ) + + def __hash__(self) -> int: + """Return a number uniquely representing the data element.""" + return hash((self.name, self.facets.get("version"))) + + def prepare(self) -> None: + """Prepare the data for access.""" + self.store.preload_data(self.name) + + @property + def attributes(self) -> dict[str, Any]: + """Attributes are key-value pairs describing the data.""" + if self._attributes is None: + msg = ( + "Attributes have not been read yet. Call the `to_iris` method " + "first to read the attributes from the file." + ) + raise ValueError(msg) + return self._attributes + + @attributes.setter + def attributes(self, value: dict[str, Any]) -> None: + self._attributes = value + + def to_iris(self) -> iris.cube.CubeList: + """Load the data as Iris cubes. + + Returns + ------- + : + The loaded data. + """ + dataset = self.store.open_data(self.name, **self.open_params) + # Keep only variables matching the "short_name" facet. + short_names = self.facets.get("short_name", []) + if isinstance(short_names, str | int): + short_names = [str(short_names)] + if short_names: + dataset = dataset[short_names] + + # Drop invalid standard_names. + # TODO: move this to a standalone fixes package. + for data_var in dataset.data_vars.values(): + if ( + "standard_name" in data_var.attrs + and data_var.attrs["standard_name"] + not in iris.std_names.STD_NAMES + ): + data_var.attrs.pop("standard_name") + + # Cache the attributes. + self.attributes = copy.deepcopy(dataset.attrs) + return dataset_to_iris(dataset) + + +@dataclass +class XCubeDataSource(esmvalcore.io.protocol.DataSource): + """Data source for finding files on a local filesystem.""" + + name: str + """A name identifying the data source.""" + + project: str + """The project that the data source provides data for.""" + + priority: int + """The priority of the data source. Lower values have priority.""" + + debug_info: str = field(init=False, repr=False, default="") + """A string containing debug information when no data is found.""" + + data_store_id: str + """Name of the data store.""" + + data_store_params: dict[str, Any] = field(default_factory=dict, repr=False) + """Parameters to use when creating the data store.""" + + open_params: dict[str, Any] = field(default_factory=dict, repr=False) + """Parameters to use when opening the data.""" + + def find_data(self, **facets: FacetValue) -> list[XCubeDataset]: # noqa: C901 + # TODO: fix complexity + """Find data. + + Parameters + ---------- + **facets : + Find data matching these facets. + + Returns + ------- + : + A list of data elements that have been found. + """ + store = xcube.core.store.new_data_store( + self.data_store_id, + **self.data_store_params, + ) + result = [] + requested_short_names = facets.get("short_name", "*") + if isinstance(requested_short_names, str | int): + requested_short_names = [str(requested_short_names)] + requested_datasets = facets.get("dataset", "*") + if isinstance(requested_datasets, str | int): + requested_datasets = [str(requested_datasets)] + available_datasets = store.list_data_ids() + for data_id in available_datasets: + for dataset_pattern in requested_datasets: + if fnmatch.fnmatchcase(data_id, dataset_pattern): + description = store.describe_data(data_id) + available_short_names = list(description.data_vars) + short_names = [ + short_name + for short_name in available_short_names + for short_name_pattern in requested_short_names + if fnmatch.fnmatchcase(short_name, short_name_pattern) + ] + # TODO: Maybe this is too complicated and we should only + # decide which variables to keep/drop after load and conversion + # to iris cube. + open_params = copy.deepcopy(self.open_params) + open_params_schema = store.get_open_data_params_schema() + if "variable_names" in open_params_schema.properties: + open_params["variable_names"] = short_names + elif "drop_variables" in open_params_schema.properties: + drop_variables = { + short_name + for short_name in available_short_names + if short_name not in short_names + } + for coord in description.coords.values(): + if bound_var := coord.attrs.get("bounds"): + drop_variables.remove(bound_var) + for data_var in description.data_vars.values(): + # TODO: keep cell measures + for ancillary_var in data_var.attrs.get( + "ancillary_variables", + "", + ).split(): + drop_variables.remove(ancillary_var) + open_params["drop_variables"] = sorted(drop_variables) + timerange = f"{description.time_range[0]}/{description.time_range[1]}".replace( + "-", + "", + ) + frequencies = { + "P1M": "mon", + } + frequency = frequencies[ + description.attrs["time_coverage_resolution"] + ] + dataset = XCubeDataset( + name=data_id, + facets={ + "dataset": data_id, + "short_name": short_names + if len(short_names) > 1 + else short_names[0], + "frequency": frequency, + "timerange": timerange, + }, + store=store, + open_params=open_params, + ) + dataset.attributes = description.attrs + + result.append(dataset) + + return result diff --git a/pyproject.toml b/pyproject.toml index 25487ece9f..1a986f4de1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,7 +69,7 @@ dependencies = [ "stratify>=0.3", "xarray", "yamale", - "zarr>3", + "zarr>2", ] description = "A community tool for pre-processing data from Earth system models in CMIP and running analysis scripts" license = {text = "Apache License, Version 2.0"} From dfa1b832f2f6e32a88b97d3052629b0ac033f20b Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Thu, 4 Dec 2025 13:09:25 +0100 Subject: [PATCH 2/5] Relax pin on zarr --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 4f74d6df05..969bb5f900 100644 --- a/environment.yml +++ b/environment.yml @@ -51,7 +51,7 @@ dependencies: - xcube - xcube-cci - yamale - - zarr >3 + - zarr >2 # Python packages needed for building docs - autodocsumm >=0.2.2 - ipython From da6f308dc5cba3dd478ca5af6ce167956c3cdd13 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Thu, 4 Dec 2025 13:25:53 +0100 Subject: [PATCH 3/5] Add docs --- doc/api/esmvalcore.io.rst | 1 + doc/api/esmvalcore.io.xcube.rst | 5 +++++ .../configurations/data-xcube-ccizarr.yml | 2 +- esmvalcore/io/xcube.py | 20 ++++++++++++++++--- 4 files changed, 24 insertions(+), 4 deletions(-) create mode 100644 doc/api/esmvalcore.io.xcube.rst diff --git a/doc/api/esmvalcore.io.rst b/doc/api/esmvalcore.io.rst index 831f07c25a..28e5b60277 100644 --- a/doc/api/esmvalcore.io.rst +++ b/doc/api/esmvalcore.io.rst @@ -21,3 +21,4 @@ Submodules esmvalcore.io.intake_esgf esmvalcore.io.local esmvalcore.io.protocol + esmvalcore.io.xcube diff --git a/doc/api/esmvalcore.io.xcube.rst b/doc/api/esmvalcore.io.xcube.rst new file mode 100644 index 0000000000..b192ac0b53 --- /dev/null +++ b/doc/api/esmvalcore.io.xcube.rst @@ -0,0 +1,5 @@ +esmvalcore.io.xcube +=================== + +.. automodule:: esmvalcore.io.xcube + :no-inherited-members: diff --git a/esmvalcore/config/configurations/data-xcube-ccizarr.yml b/esmvalcore/config/configurations/data-xcube-ccizarr.yml index e53e74fc99..c01dfe2977 100644 --- a/esmvalcore/config/configurations/data-xcube-ccizarr.yml +++ b/esmvalcore/config/configurations/data-xcube-ccizarr.yml @@ -1,4 +1,4 @@ -# Read ESA Climate Data Centre (ESA CCI) using xcube. +# Read data from the ESA Climate Data Centre (ESA CCI) using xcube. # https://xcube.readthedocs.io/en/latest/dataaccess.html#esa-climate-data-centre-esa-cci-cciodp-ccizarr-esa-cci-kc projects: external: diff --git a/esmvalcore/io/xcube.py b/esmvalcore/io/xcube.py index abb82888d3..e37bf480d6 100644 --- a/esmvalcore/io/xcube.py +++ b/esmvalcore/io/xcube.py @@ -1,4 +1,14 @@ -"""Load data using ``xcube``.""" +"""Access data using `xcube `_. + +Run the command ``esmvaltool config copy data-xcube-ccizarr.yml`` to update +your :ref:`configuration ` to use this module. This will +create a file with the following content in your configuration directory: + +.. literalinclude:: ../configurations/data-xcube-ccizarr.yml + :language: yaml + :caption: Contents of ``data-xcube-ccizarr.yml`` + +""" from __future__ import annotations @@ -20,7 +30,7 @@ @dataclass class XCubeDataset(esmvalcore.io.protocol.DataElement): - """A dataset that can be used to load data found using intake-esgf_.""" + """A dataset that can be used to load data found using xcube_.""" name: str """A unique name identifying the data.""" @@ -111,7 +121,11 @@ class XCubeDataSource(esmvalcore.io.protocol.DataSource): """A string containing debug information when no data is found.""" data_store_id: str - """Name of the data store.""" + """Name of the data store. + + A list of available data stores can be found in the `xcube documentation + `__. + """ data_store_params: dict[str, Any] = field(default_factory=dict, repr=False) """Parameters to use when creating the data store.""" From e5810ea1f726c00ad165ba4f18a931ff60b9554a Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Thu, 4 Dec 2025 13:31:08 +0100 Subject: [PATCH 4/5] Fix config file docstring --- esmvalcore/config/configurations/data-xcube-ccizarr.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/esmvalcore/config/configurations/data-xcube-ccizarr.yml b/esmvalcore/config/configurations/data-xcube-ccizarr.yml index c01dfe2977..abf01b95db 100644 --- a/esmvalcore/config/configurations/data-xcube-ccizarr.yml +++ b/esmvalcore/config/configurations/data-xcube-ccizarr.yml @@ -1,5 +1,6 @@ # Read data from the ESA Climate Data Centre (ESA CCI) using xcube. -# https://xcube.readthedocs.io/en/latest/dataaccess.html#esa-climate-data-centre-esa-cci-cciodp-ccizarr-esa-cci-kc +# More information available at +# https://xcube.readthedocs.io/en/latest/dataaccess.html#esa-climate-data-centre-esa-cci-cciodp-ccizarr-esa-cci-kc. projects: external: data: From 41b068f14c5eb298233ebb11727a0bd3c13a20d6 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Fri, 12 Dec 2025 21:43:30 +0100 Subject: [PATCH 5/5] Undo changes that are no longer necessary --- esmvalcore/config-developer.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/esmvalcore/config-developer.yml b/esmvalcore/config-developer.yml index 4ace8ccc19..32691ab504 100644 --- a/esmvalcore/config-developer.yml +++ b/esmvalcore/config-developer.yml @@ -26,8 +26,6 @@ # cmor_path: ~/my/own/custom_tables ############################################################################### --- -external: - output_file: "{dataset}_{short_name}" CMIP6: cmor_strict: true