From 33c5f79d59d2a43f33cc25bb335c5ec5e8e16e57 Mon Sep 17 00:00:00 2001 From: Joel Miller Date: Fri, 11 Apr 2025 17:21:22 +0100 Subject: [PATCH 01/62] First implementation of HadISD integration using ERA5 as template (untested) --- .../pyearthtools/tutorial/HadisdDataClass.py | 151 ++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py diff --git a/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py b/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py new file mode 100644 index 00000000..8349a72b --- /dev/null +++ b/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py @@ -0,0 +1,151 @@ +# Copyright Commonwealth of Australia, Bureau of Meteorology 2024. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +ECWMF ReAnalysis v5, Low-Resolution / WeatherBench Example + +The purpose of this module is to hold the index class which can be +registered into the pyearthtools package namespace for easy access. + +The code here is the interface between the pyearthtools API and accessing +files on the filesystem. 
+ +An indexer takes some query and maps it to the corresponding data on the filesystem. +""" + +from __future__ import annotations + +import functools +from pathlib import Path +from typing import Any, Literal + +import pyearthtools.data +from pyearthtools.data import Petdt +from pyearthtools.data.archive import register_archive +from pyearthtools.data.exceptions import DataNotFoundError +from pyearthtools.data.indexes import ArchiveIndex, decorators +from pyearthtools.data.transforms import Transform, TransformCollection + +from pyearthtools.tutorial.ancilliary.ERA5lowres import ( + ERA5_PRESSURE_VARIABLES, + ERA5_SINGLE_VARIABLES, +) + +# This tells pyearthtools what the actual resolution or time-step of the data is inside the files +ERA_RESOLUTION = (1, "hour") + +# This dictionary tells pyearthtools what variable renames to apply during load +ERA5_RENAME = {"t2m": "2t", "u10": "10u", "v10": "10v", "siconc": "ci"} + +V_TO_PATH = { + "10m_u_component_of_wind": "10m_u_component_of_wind", + "10m_v_component_of_wind": "10m_v_component_of_wind", + "2m_temperature": "2m_temperature", + # "constants": "constants", # FIXME not working + "geopotential": "geopotential", + # "geopotential_500": "geopotential_500", # FIXME not working + "potential_vorticity": "potential_vorticity", + "rh": "relative_humidity", + "specific_humidity": "specific_humidity", + "temperature": "temperature", + # "temperature_850": "temperature_850", # FIXME not working + "toa_incident_solar_radiation": "toa_incident_solar_radiation", + "total_cloud_cover": "total_cloud_cover", + "total_precipitation": "total_precipitation", + "u": "u_component_of_wind", + "v": "v_component_of_wind", + "vorticity": "vorticity", +} + + +@functools.lru_cache() +def cached_iterdir(path: Path) -> list[Path]: + """Run iterdir but cached""" + return list(path.iterdir()) + + +@functools.lru_cache() +def cached_exists(path: Path) -> bool: + """Run exists but cached""" + return path.exists() + + +@register_archive("hadisd", sample_kwargs=dict(station="010010-99999")) +class 
HadISDIndex(ArchiveIndex): + """HadISD Dataset Index""" + + @property + def _desc_(self): + return { + "singleline": "HadISD Dataset", + "range": "1931-2024", + "Documentation": "https://www.metoffice.gov.uk/hadobs/hadisd/", + } + + def __init__(self, station: str, *, transforms=None): + """ + Setup HadISD Indexer + + Args: + station (str): Station ID to retrieve data for. + transforms (optional): Base transforms to apply. + """ + self.station = station + super().__init__(transforms=transforms) + + def filesystem(self, querytime: str | Petdt) -> Path: + """ + Map a query (station ID and date) to the corresponding file. + + Args: + querytime (str | Petdt): Query date. + + Returns: + Path: Path to the corresponding file. + + Raises: + DataNotFoundError: If the file is not found. + """ + HADISD_HOME = self.ROOT_DIRECTORIES["hadisd"] + + # Convert querytime to Petdt for consistency + querytime = Petdt(querytime) + + # Construct the expected filename pattern + station_id = self.station + date_range = "19310101-20240101" # Hardcoded for now; adjust if needed + version = "hadisd.3.4.0.2023f" + + filename = f"{version}_{date_range}_{station_id}.nc" + file_path = Path(HADISD_HOME) / filename + + # Check if the file exists + if not file_path.exists(): + raise DataNotFoundError( + f"File not found for station: {station_id}, date: {querytime}, path: {file_path}" + ) + + return file_path + + @property + def _import(self): + """module to import for to load this step in an Pipeline""" + return "pyearthtools.tutorial" + + + +# Notes to Joel +# - Does PET have the ability to check a NetCDF file for the variables it contains? 
+# - If not, we should add that to PET so that a user can be given suggestions for what variables to select, should they give an incorrect variable name From eee0352d76f09d0cec23213bb5407b780d055646 Mon Sep 17 00:00:00 2001 From: Joel Miller Date: Fri, 11 Apr 2025 17:21:39 +0100 Subject: [PATCH 02/62] First implementation of HadISD integration using ERA5 as template (untested) --- .../pyearthtools/tutorial/HadisdDataClass.py | 58 ++++++++++++++++++- 1 file changed, 55 insertions(+), 3 deletions(-) diff --git a/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py b/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py index 8349a72b..49a09868 100644 --- a/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py +++ b/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py @@ -105,6 +105,7 @@ def __init__(self, station: str, *, transforms=None): self.station = station super().__init__(transforms=transforms) + def filesystem(self, querytime: str | Petdt) -> Path: """ Map a query (station ID and date) to the corresponding file. 
@@ -123,13 +124,63 @@ def filesystem(self, querytime: str | Petdt) -> Path: # Convert querytime to Petdt for consistency querytime = Petdt(querytime) - # Construct the expected filename pattern + # Determine the parent folder based on the station ID station_id = self.station + wmo_number = station_id[:6] # Extract the first 6 digits of the station ID + + # Define the station ranges and corresponding folders + STATION_RANGES = [ + (0, 29999, "WMO_000000-029999"), + (30000, 49999, "WMO_030000-049999"), + (50000, 79999, "WMO_050000-079999"), + (80000, 99999, "WMO_080000-099999"), + (100000, 149999, "WMO_100000-149999"), + (150000, 199999, "WMO_150000-199999"), + (200000, 249999, "WMO_200000-249999"), + (250000, 299999, "WMO_250000-299999"), + (300000, 349999, "WMO_300000-349999"), + (350000, 399999, "WMO_350000-399999"), + (400000, 449999, "WMO_400000-449999"), + (450000, 499999, "WMO_450000-499999"), + (500000, 549999, "WMO_500000-549999"), + (550000, 599999, "WMO_550000-599999"), + (600000, 649999, "WMO_600000-649999"), + (650000, 699999, "WMO_650000-699999"), + (700000, 709999, "WMO_700000-709999"), + (710000, 714999, "WMO_710000-714999"), + (715000, 719999, "WMO_715000-719999"), + (720000, 721999, "WMO_720000-721999"), + (722000, 722999, "WMO_722000-722999"), + (723000, 723999, "WMO_723000-723999"), + (724000, 724999, "WMO_724000-724999"), + (725000, 725999, "WMO_725000-725999"), + (726000, 726999, "WMO_726000-726999"), + (727000, 729999, "WMO_727000-729999"), + (730000, 799999, "WMO_730000-799999"), + (800000, 849999, "WMO_800000-849999"), + (850000, 899999, "WMO_850000-899999"), + (900000, 949999, "WMO_900000-949999"), + (950000, 999999, "WMO_950000-999999"), + ] + + # Find the parent folder dynamically + parent_folder = None + station_numeric = int(wmo_number) # Convert the WMO number to an integer + for start, end, folder in STATION_RANGES: + if start <= station_numeric <= end: + parent_folder = folder + break + + if parent_folder is None: + raise 
ValueError(f"Station ID {station_id} does not fall within any defined range.") + + # Construct the expected filename date_range = "19310101-20240101" # Hardcoded for now; adjust if needed version = "hadisd.3.4.0.2023f" + filename = f"{version}_{date_range}_{wmo_number}.nc" - filename = f"{version}_{date_range}_{station_id}.nc" - file_path = Path(HADISD_HOME) / filename + # Construct the full path + file_path = Path(HADISD_HOME) / parent_folder / filename # Check if the file exists if not file_path.exists(): @@ -137,6 +188,7 @@ def filesystem(self, querytime: str | Petdt) -> Path: f"File not found for station: {station_id}, date: {querytime}, path: {file_path}" ) + # Return the constructed file path return file_path @property From da7bdf0ceb9fb4144b424588cd83d5f95dcc646b Mon Sep 17 00:00:00 2001 From: Joel Miller Date: Tue, 15 Apr 2025 10:46:55 +0100 Subject: [PATCH 03/62] Updata HadISDIndex class: Can now successfully load Hadisd data from within PET --- .../pyearthtools/tutorial/HadisdDataClass.py | 51 +++---------------- 1 file changed, 8 insertions(+), 43 deletions(-) diff --git a/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py b/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py index 49a09868..2de9961f 100644 --- a/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py +++ b/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py @@ -38,37 +38,6 @@ from pyearthtools.data.indexes import ArchiveIndex, decorators from pyearthtools.data.transforms import Transform, TransformCollection -from pyearthtools.tutorial.ancilliary.ERA5lowres import ( - ERA5_PRESSURE_VARIABLES, - ERA5_SINGLE_VARIABLES, -) - -# This tells pyearthtools what the actual resolution or time-step of the data is inside the files -ERA_RESOLUTION = (1, "hour") - -# This dictionary tells pyearthtools what variable renames to apply during load -ERA5_RENAME = {"t2m": "2t", "u10": "10u", "v10": "10v", "siconc": "ci"} - -V_TO_PATH = { - "10m_u_component_of_wind": 
"10m_u_component_of_wind", - "10m_v_component_of_wind": "10m_v_component_of_wind", - "2m_temperature": "2m_temperature", - # "constants": "constants", # FIXME not working - "geopotential": "geopotential", - # "geopotential_500": "geopotential_500", # FIXME not working - "potential_vorticity": "potential_vorticity", - "rh": "relative_humidity", - "specific_humidity": "specific_humidity", - "temperature": "temperature", - # "temperature_850": "temperature_850", # FIXME not working - "toa_incident_solar_radiation": "toa_incident_solar_radiation", - "total_cloud_cover": "total_cloud_cover", - "total_precipitation": "total_precipitation", - "u": "u_component_of_wind", - "v": "v_component_of_wind", - "vorticity": "vorticity", -} - @functools.lru_cache() def cached_iterdir(path: Path) -> list[Path]: @@ -106,12 +75,9 @@ def __init__(self, station: str, *, transforms=None): super().__init__(transforms=transforms) - def filesystem(self, querytime: str | Petdt) -> Path: + def filesystem(self, *args, **kwargs) -> Path: """ - Map a query (station ID and date) to the corresponding file. - - Args: - querytime (str | Petdt): Query date. + Map a query (station ID) to the corresponding file. Returns: Path: Path to the corresponding file. 
@@ -121,12 +87,9 @@ def filesystem(self, querytime: str | Petdt) -> Path: """ HADISD_HOME = self.ROOT_DIRECTORIES["hadisd"] - # Convert querytime to Petdt for consistency - querytime = Petdt(querytime) - # Determine the parent folder based on the station ID station_id = self.station - wmo_number = station_id[:6] # Extract the first 6 digits of the station ID + wmo_number = station_id[:6] # Extract the first 6 digits of the station ID for determining the WMO number and parent folder # Define the station ranges and corresponding folders STATION_RANGES = [ @@ -175,9 +138,9 @@ def filesystem(self, querytime: str | Petdt) -> Path: raise ValueError(f"Station ID {station_id} does not fall within any defined range.") # Construct the expected filename - date_range = "19310101-20240101" # Hardcoded for now; adjust if needed + date_range = "19310101-20240101" # Hardcoded for now; adjust if dataset is updated version = "hadisd.3.4.0.2023f" - filename = f"{version}_{date_range}_{wmo_number}.nc" + filename = f"{version}_{date_range}_{station_id}.nc" # Construct the full path file_path = Path(HADISD_HOME) / parent_folder / filename @@ -185,7 +148,7 @@ def filesystem(self, querytime: str | Petdt) -> Path: # Check if the file exists if not file_path.exists(): raise DataNotFoundError( - f"File not found for station: {station_id}, date: {querytime}, path: {file_path}" + f"File not found for station: {station_id}, path: {file_path}" ) # Return the constructed file path @@ -201,3 +164,5 @@ def _import(self): # Notes to Joel # - Does PET have the ability to check a NetCDF file for the variables it contains? # - If not, we should add that to PET so that a user can be given suggestions for what variables to select, should they give an incorrect variable name +# - # Convert querytime to Petdt for consistency. 
Useful for other classes that may not have the same conversion + # querytime = Petdt(querytime) \ No newline at end of file From 45b07ae6dd7a3b7ca21aa748c3febf4ff46b77ad Mon Sep 17 00:00:00 2001 From: Joel Miller Date: Tue, 15 Apr 2025 18:16:54 +0100 Subject: [PATCH 04/62] Create notebook for documenting HadISD dataset integration --- .../tutorial/HadISD_Dataset_Integration.ipynb | 110 ++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 notebooks/tutorial/HadISD_Dataset_Integration.ipynb diff --git a/notebooks/tutorial/HadISD_Dataset_Integration.ipynb b/notebooks/tutorial/HadISD_Dataset_Integration.ipynb new file mode 100644 index 00000000..110fd529 --- /dev/null +++ b/notebooks/tutorial/HadISD_Dataset_Integration.ipynb @@ -0,0 +1,110 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using the HadISD datset (version 3.4.0.2023f) with PyEarthTools\n", + "HadISD is a global sub-daily dataset based on the ISD dataset from NOAA's NCEI. 
As well as station selection criteria, a suite of quality control tests has been run on the major climatological variables.\n", + "\n", + "The dataset can be downloaded here: https://www.metoffice.gov.uk/hadobs/hadisd/v340_2023f/download.html" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "from pathlib import Path\n", + "\n", + "import numpy as np\n", + "import xarray as xr\n", + "import scores\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "from torch.utils.data import DataLoader\n", + "from lightning import Trainer, LightningModule\n", + "from lightning.pytorch.callbacks import RichProgressBar\n", + "from rich.progress import track\n", + "\n", + "import pyearthtools.data\n", + "import pyearthtools.tutorial\n", + "import pyearthtools.pipeline\n", + "import pyearthtools.training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# train/validation/test split dates\n", + "train_start = \"1970-01-01T00\"\n", + "train_end = \"2022-12-31T23\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_prep_pipe = pyearthtools.pipeline.Pipeline(\n", + " # pyearthtools.data.archive.hadisd(country_code = \"24\", station_id = \"010010\", nearest_neighbors = 12, lon= 0.0, lat = 0.0), # Geospatial fence \n", + " pyearthtools.data.archive.hadisd(\"010010-99999\", variables = [\"slp\", \"dewpoints\", \"temperatures\"]), # Geospatial fence \n", + " # pyearthtools.data.archive.hadisd(\"010014-99999\"), # Geospatial fence\n", + ")\n", + "data_prep_pipe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ds = data_prep_pipe[\"1931-01-01T06\"]\n", + "ds" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In some cases you will get the 
following error when passing a date time to a pipeline object: `IndexWarning: Could not find time in dataset to select on. Petdt('1931-01-01T07')`
\n", + "\n", + "This indicates that data for the datetime you chose does not exist. In this case PET will load all data from your station selection." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pet_tutorial", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 3484dd6fdd913d49b050af0f0ddce9c15c06e35b Mon Sep 17 00:00:00 2001 From: Joel Miller Date: Tue, 15 Apr 2025 18:18:47 +0100 Subject: [PATCH 05/62] Update HadISDIndex class so variables can be selected using transforms --- .../pyearthtools/tutorial/HadisdDataClass.py | 36 +++++++++++++++++-- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py b/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py index 2de9961f..a6bf2b25 100644 --- a/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py +++ b/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py @@ -63,7 +63,13 @@ def _desc_(self): "Documentation": "https://www.metoffice.gov.uk/hadobs/hadisd/", } - def __init__(self, station: str, *, transforms=None): + def __init__( + self, + station: str, + variables: list[str] | str | None = None, # Ensure this is defined + *, + transforms: Transform | TransformCollection | None = None, # Ensure this is keyword-only + ): """ Setup HadISD Indexer @@ -72,7 +78,31 @@ def __init__(self, station: str, *, transforms=None): transforms (optional): Base transforms to apply. 
""" self.station = station - super().__init__(transforms=transforms) + self.variables = [variables] if isinstance(variables, str) else variables + + + # Define the base transforms + base_transform = TransformCollection() + + # Add a transform to drop unused variables (if variables are provided) REMOVE ONCE TESTED!!!!!! + if variables: + base_transform += pyearthtools.data.transforms.variables.Select(self.variables) + + # Add the variable selection transform, and any other transforms you want to apply + # Future code to do this goes here, but the selcet class for variable doesn't exist yet, should look like this + # if variables: + # base_transform += pyearthtools.data.transforms.variables.Select( + # {var: self.variable for var in ["variable"]}, ignore_missing=True + # ) + + + # Call the parent class's __init__ method + super().__init__( + transforms=base_transform + (transforms or TransformCollection()), + ) + + self.record_initialisation() + def filesystem(self, *args, **kwargs) -> Path: @@ -128,7 +158,7 @@ def filesystem(self, *args, **kwargs) -> Path: # Find the parent folder dynamically parent_folder = None - station_numeric = int(wmo_number) # Convert the WMO number to an integer + station_numeric = int(wmo_number) # Convert the WMO number to an integer (I think WMO is just first 6 digitis of station ID?) 
for start, end, folder in STATION_RANGES: if start <= station_numeric <= end: parent_folder = folder From ae7b0639755938d818451b23903dacfe19ce81b4 Mon Sep 17 00:00:00 2001 From: Joel Miller Date: Tue, 15 Apr 2025 18:20:11 +0100 Subject: [PATCH 06/62] Add small explainer comments --- packages/tutorial/src/pyearthtools/tutorial/ERA5DataClass.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packages/tutorial/src/pyearthtools/tutorial/ERA5DataClass.py b/packages/tutorial/src/pyearthtools/tutorial/ERA5DataClass.py index 7ec32f44..5e3020b8 100644 --- a/packages/tutorial/src/pyearthtools/tutorial/ERA5DataClass.py +++ b/packages/tutorial/src/pyearthtools/tutorial/ERA5DataClass.py @@ -138,9 +138,10 @@ def __init__( self.level_value = level_value + # Apply level selection if level_value is provided if level_value: base_transform += pyearthtools.data.transforms.coordinates.Select( - {coord: level_value for coord in ["level"]}, ignore_missing=True + {coord: level_value for coord in ["level"]}, ignore_missing=True # Handles case where level is not present eg 2m_temperature ) super().__init__( From 7e4a6c19de41b4cf6fcf6561892b7ca0a2df36ba Mon Sep 17 00:00:00 2001 From: Joel Miller Date: Wed, 16 Apr 2025 11:38:50 +0100 Subject: [PATCH 07/62] Add transform for selecting variables when loading datasets --- .../pyearthtools/data/transforms/variables.py | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/packages/data/src/pyearthtools/data/transforms/variables.py b/packages/data/src/pyearthtools/data/transforms/variables.py index 7bf6b279..8cf090bc 100644 --- a/packages/data/src/pyearthtools/data/transforms/variables.py +++ b/packages/data/src/pyearthtools/data/transforms/variables.py @@ -92,6 +92,47 @@ def apply(self, dataset: xr.Dataset) -> xr.Dataset: if not var_included: return dataset return dataset[var_included] + + +class Select(Transform): + """Select specific dataset variables""" + + def __init__(self, variables: list[str] | str, 
*extra_variables): + """ + Select variables from the dataset. + + Args: + variables (list[str] | str): + List of variables to select. + """ + super().__init__() + self.record_initialisation() + + # Ensure variables is always a list + variables = variables if isinstance(variables, (list, tuple)) else [variables] + self._variables = [*variables, *extra_variables] + + def apply(self, dataset: xr.Dataset) -> xr.Dataset: + """ + Apply the transform to select specific variables. + + Args: + dataset (xr.Dataset): The dataset to transform. + + Returns: + xr.Dataset: A dataset containing only the selected variables. + """ + if self._variables is None: + return dataset + + # Select only the variables that exist in the dataset + var_included = set(self._variables) & set(dataset.data_vars) + + if not var_included: + # If no variables match, return the original dataset + return dataset + + return dataset[list(var_included)] @BackwardsCompatibility(Drop) From 38d60f60565914406f8fec117c9dce0d1cb78619 Mon Sep 17 00:00:00 2001 From: Joel Miller Date: Wed, 16 Apr 2025 12:06:58 +0100 Subject: [PATCH 08/62] Update class to support selecting variables --- .../src/pyearthtools/tutorial/HadisdDataClass.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py b/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py index a6bf2b25..5ce097db 100644 --- a/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py +++ b/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py @@ -84,17 +84,10 @@ def __init__( # Define the base transforms base_transform = TransformCollection() - # Add a transform to drop unused variables (if variables are provided) REMOVE ONCE TESTED!!!!!! 
+ # Add a transform to select variables (if variables are provided) if variables: base_transform += pyearthtools.data.transforms.variables.Select(self.variables) - # Add the variable selection transform, and any other transforms you want to apply - # Future code to do this goes here, but the selcet class for variable doesn't exist yet, should look like this - # if variables: - # base_transform += pyearthtools.data.transforms.variables.Select( - # {var: self.variable for var in ["variable"]}, ignore_missing=True - # ) - # Call the parent class's __init__ method super().__init__( From ab920100cdc2c4eb48ab7d89da877753675668f1 Mon Sep 17 00:00:00 2001 From: Joel Miller Date: Wed, 16 Apr 2025 16:03:19 +0100 Subject: [PATCH 09/62] Add SetMissingToNaN transform class --- .../pyearthtools/data/transforms/values.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/packages/data/src/pyearthtools/data/transforms/values.py b/packages/data/src/pyearthtools/data/transforms/values.py index f85b51bd..21f009ed 100644 --- a/packages/data/src/pyearthtools/data/transforms/values.py +++ b/packages/data/src/pyearthtools/data/transforms/values.py @@ -21,6 +21,7 @@ from typing import Literal import xarray as xr +import numpy as np import pyearthtools.data from pyearthtools.data.transforms.transform import Transform @@ -66,6 +67,27 @@ def apply(self, dataset: xr.Dataset) -> xr.Dataset: if self._direction in ["both", "backward"]: dataset = dataset.bfill(coord, limit=self._limit) return encod(dataset) + + +class SetMissingToNaN(Transform): + """ + Transform to replace specified missing values with NaN for given variables. + + Args: + varname_val_map (dict[str, float]): A dictionary mapping variable names to their missing values. 
+ """ + + def __init__(self, varname_val_map: dict[str, float]): + super().__init__() + self.record_initialisation() + + self.varname_val_map = varname_val_map + + def apply(self, dataset: xr.Dataset) -> xr.Dataset: + for var_name, miss_val in self.varname_val_map.items(): + if var_name in dataset: + dataset[var_name] = dataset[var_name].where(dataset[var_name] != miss_val, np.nan) + return dataset @BackwardsCompatibility(Fill) From d703c3aad4b38998b9ad36fc1a0510c95d49e4a6 Mon Sep 17 00:00:00 2001 From: Joel Miller Date: Wed, 16 Apr 2025 16:04:38 +0100 Subject: [PATCH 10/62] Update HadISDIndex Class to make use of new SetMissingToNaN transform --- .../pyearthtools/tutorial/HadisdDataClass.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py b/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py index 5ce097db..fa5d4ead 100644 --- a/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py +++ b/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py @@ -37,6 +37,18 @@ from pyearthtools.data.exceptions import DataNotFoundError from pyearthtools.data.indexes import ArchiveIndex, decorators from pyearthtools.data.transforms import Transform, TransformCollection +from pyearthtools.data.transforms.variables import Drop, Select +from pyearthtools.data.transforms.values import SetMissingToNaN + + +# This dictionary tells pyearthtools which variables have missing values and what those values are. +varname_val_map = { + "total_cloud_cover": -999., + "low_cloud_cover": -999., + "mid_cloud_cover": -999., + "high_cloud_cover": -999. 
+ } +# TODO:Check that these values actually represent missing values in the dataset @functools.lru_cache() @@ -80,14 +92,15 @@ def __init__( self.station = station self.variables = [variables] if isinstance(variables, str) else variables - # Define the base transforms base_transform = TransformCollection() + base_transform += Drop("reporting_stats" ) # Add a transform to select variables (if variables are provided) if variables: - base_transform += pyearthtools.data.transforms.variables.Select(self.variables) + base_transform += Select(self.variables) + base_transform += SetMissingToNaN(varname_val_map) # Call the parent class's __init__ method super().__init__( @@ -97,7 +110,6 @@ def __init__( self.record_initialisation() - def filesystem(self, *args, **kwargs) -> Path: """ Map a query (station ID) to the corresponding file. From 1a6e8c6d26d5ca26dcd547b7d7db00a901631e5d Mon Sep 17 00:00:00 2001 From: Joel Miller Date: Wed, 16 Apr 2025 16:05:35 +0100 Subject: [PATCH 11/62] Update HadISD note book to reflect addition of new transforms --- .../tutorial/HadISD_Dataset_Integration.ipynb | 109 ++++++++++++++---- 1 file changed, 85 insertions(+), 24 deletions(-) diff --git a/notebooks/tutorial/HadISD_Dataset_Integration.ipynb b/notebooks/tutorial/HadISD_Dataset_Integration.ipynb index 110fd529..bc5756b3 100644 --- a/notebooks/tutorial/HadISD_Dataset_Integration.ipynb +++ b/notebooks/tutorial/HadISD_Dataset_Integration.ipynb @@ -16,24 +16,9 @@ "metadata": {}, "outputs": [], "source": [ - "import sys\n", - "from pathlib import Path\n", - "\n", - "import numpy as np\n", - "import xarray as xr\n", - "import scores\n", - "import torch\n", - "import torch.nn as nn\n", - "import torch.nn.functional as F\n", - "from torch.utils.data import DataLoader\n", - "from lightning import Trainer, LightningModule\n", - "from lightning.pytorch.callbacks import RichProgressBar\n", - "from rich.progress import track\n", - "\n", - "import pyearthtools.data\n", - "import 
pyearthtools.tutorial\n", - "import pyearthtools.pipeline\n", - "import pyearthtools.training" + "import pyearthtools.pipeline as petpipe\n", + "import pyearthtools.data as petdata\n", + "import pyearthtools.tutorial\n" ] }, { @@ -53,10 +38,28 @@ "metadata": {}, "outputs": [], "source": [ - "data_prep_pipe = pyearthtools.pipeline.Pipeline(\n", - " # pyearthtools.data.archive.hadisd(country_code = \"24\", station_id = \"010010\", nearest_neighbors = 12, lon= 0.0, lat = 0.0), # Geospatial fence \n", - " pyearthtools.data.archive.hadisd(\"010010-99999\", variables = [\"slp\", \"dewpoints\", \"temperatures\"]), # Geospatial fence \n", - " # pyearthtools.data.archive.hadisd(\"010014-99999\"), # Geospatial fence\n", + "varname_val_map = {\n", + " \"total_cloud_cover\": -888., \n", + " \"low_cloud_cover\": -999., \n", + " \"mid_cloud_cover\": -999.,\n", + " \"high_cloud_cover\": -999.\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_prep_pipe = petpipe.Pipeline(\n", + " petdata.archive.hadisd((\"010010-99999\"), variables = [\"total_cloud_cover\", \"temperatures\"]),\n", + " petdata.transforms.values.SetMissingToNaN(varname_val_map),\n", + " # petdata.transforms.variables.Drop([\"station_id\", \"input_station_id\"]),\n", + " # petpipe.operations.xarray.conversion.ToNumpy(),\n", + " # petdata.archive.hadisd(\"010014-99999\", variables = [\"slp\", \"dewpoints\", \"temperatures\", \"station_id\"]),\n", + " # petdata.archive.hadisd(country_code = \"24\", station_id = \"010010\", nearest_neighbors = 12, lon= 0.0, lat = 0.0), # Geospatial fence \n", + " # petdata.archive.hadisd(\"010014-99999\"), # Geospatial fence\n", ")\n", "data_prep_pipe" ] @@ -67,7 +70,7 @@ "metadata": {}, "outputs": [], "source": [ - "ds = data_prep_pipe[\"1931-01-01T06\"]\n", + "ds = data_prep_pipe[\"1931-01-01T07\"]\n", "ds" ] }, @@ -81,8 +84,66 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# show total_cloud_cover from ds\n", + "tcc = ds[\"total_cloud_cover\"]\n", + "tcc" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], + "source": [ + "import xarray as xr\n", + "from pyearthtools.data.transforms.values import SetMissingToNaN\n", + "import numpy as np\n", + "\n", + "def test_set_missing_to_nan():\n", + " data = xr.Dataset({\n", + " \"total_cloud_cover\": (\"time\", [0, -999, 50]),\n", + " \"low_cloud_cover\": (\"time\", [10, -999, 20]),\n", + " })\n", + "\n", + " varname_val_map = {\n", + " \"total_cloud_cover\": -99.0,\n", + " \"low_cloud_cover\": -999.0,\n", + " }\n", + "\n", + " transform = SetMissingToNaN(varname_val_map)\n", + " transformed_data = transform.apply(data)\n", + "\n", + " if np.isnan(transformed_data[\"total_cloud_cover\"].data[1]):\n", + " print(\"Total cloud cover at index 1 is NaN\")\n", + " else:\n", + " print(\"Total cloud cover at index 1 is not NaN\")\n", + " \n", + " if np.isnan(transformed_data[\"low_cloud_cover\"].data[1]):\n", + " print(\"Low cloud cover at index 1 is NaN\")\n", + " \n", + " if transformed_data[\"total_cloud_cover\"].data[2] == 50:\n", + " print(\"Total cloud cover at index 2 is 50\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_set_missing_to_nan()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [] } ], From 30cb7211097b1a72ed9ebdab5043cc6d38409748 Mon Sep 17 00:00:00 2001 From: Joel Miller Date: Wed, 16 Apr 2025 16:06:21 +0100 Subject: [PATCH 12/62] Add new test for SetMissingToNaN transform --- packages/data/tests/transform/test_values.py | 21 ++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 packages/data/tests/transform/test_values.py diff --git a/packages/data/tests/transform/test_values.py 
b/packages/data/tests/transform/test_values.py new file mode 100644 index 00000000..77b6ee42 --- /dev/null +++ b/packages/data/tests/transform/test_values.py @@ -0,0 +1,21 @@ +import xarray as xr +from pyearthtools.data.transforms.values import SetMissingToNaN +import numpy as np + +def test_set_missing_to_nan(): + data = xr.Dataset({ + "total_cloud_cover": ("time", [0, -999, 50]), + "low_cloud_cover": ("time", [10, -999, 20]), + }) + + varname_val_map = { + "total_cloud_cover": -999.0, + "low_cloud_cover": -999.0, + } + + transform = SetMissingToNaN(varname_val_map) + transformed_data = transform.apply(data) + + assert np.isnan(transformed_data["total_cloud_cover"].data[1]) + assert np.isnan(transformed_data["low_cloud_cover"].data[1]) + assert transformed_data["total_cloud_cover"].data[2] == 50 \ No newline at end of file From 47c0206cdb88b915ec3debf66b6929f622af1b4d Mon Sep 17 00:00:00 2001 From: Joel Miller Date: Thu, 24 Apr 2025 10:03:24 +0100 Subject: [PATCH 13/62] Create add_flagged_obs class in values.py module --- .../pyearthtools/data/transforms/values.py | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/packages/data/src/pyearthtools/data/transforms/values.py b/packages/data/src/pyearthtools/data/transforms/values.py index 21f009ed..f28d0284 100644 --- a/packages/data/src/pyearthtools/data/transforms/values.py +++ b/packages/data/src/pyearthtools/data/transforms/values.py @@ -90,6 +90,53 @@ def apply(self, dataset: xr.Dataset) -> xr.Dataset: return dataset +class AddFlaggedObs(Transform): + """ + Transform to restore flagged observations removed by QC tests back into the dataset. + + Args: + flagged_labels (list[str]): A list of variable names corresponding to flagged observations. 
+ """ + + def __init__(self, flagged_labels: list[str]): + super().__init__() + self.record_initialisation() + self.flagged_labels = flagged_labels + + def apply(self, dataset: xr.Dataset) -> xr.Dataset: + """ + Apply the transformation to restore flagged observations. + + Args: + dataset (xr.Dataset): The input dataset containing flagged_obs and QC'd variables. + + Returns: + xr.Dataset: The dataset with flagged observations restored. + """ + # Attach a coordinate to the flagged dimension with descriptive labels + dataset = dataset.assign_coords(flagged=("flagged", self.flagged_labels)) + + # Iterate through flagged variables and restore flagged data + for var_name in dataset["flagged"].data: + flagged_var = dataset["flagged_obs"].sel(flagged=var_name) + qcd_var = dataset[var_name] + + # Fill NaNs in flagged variable with data from QC'd variable + dataset[var_name].data = flagged_var.fillna(qcd_var).data + + # # not all flagged values are available in flagged obs. Why? + # # TODO: understand why and replace with observations if possible + # dataset[var_name][dataset[var_name] == dataset[var_name].attrs['flagged_value']] = np.nan + + # Replace flagged placeholder values with NaN + if "flagged_value" in dataset[var_name].attrs: + # Compute the boolean condition to avoid Dask-related issues + mask = (dataset[var_name] == dataset[var_name].attrs["flagged_value"]).compute() + dataset[var_name] = dataset[var_name].where(~mask, np.nan) + + return dataset + + @BackwardsCompatibility(Fill) def fill(*args, **kwargs): ... 
From dd7bebae64590d3c8cc0e136d323216b5f355c56 Mon Sep 17 00:00:00 2001 From: Joel Miller Date: Thu, 24 Apr 2025 10:57:09 +0100 Subject: [PATCH 14/62] Update Hadisd notebook --- .../tutorial/HadISD_Dataset_Integration.ipynb | 1525 ++++++++++++++++- 1 file changed, 1516 insertions(+), 9 deletions(-) diff --git a/notebooks/tutorial/HadISD_Dataset_Integration.ipynb b/notebooks/tutorial/HadISD_Dataset_Integration.ipynb index bc5756b3..baa92779 100644 --- a/notebooks/tutorial/HadISD_Dataset_Integration.ipynb +++ b/notebooks/tutorial/HadISD_Dataset_Integration.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -40,7 +40,7 @@ "source": [ "varname_val_map = {\n", " \"total_cloud_cover\": -888., \n", - " \"low_cloud_cover\": -999., \n", + " \"low_cloud_cover\": -888., \n", " \"mid_cloud_cover\": -999.,\n", " \"high_cloud_cover\": -999.\n", " }" @@ -48,9 +48,468 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Pipeline\n",
+       "\tDescription                    `pyearthtools.pipeline` Data Pipeline\n",
+       "\n",
+       "\n",
+       "\tInitialisation                 \n",
+       "\t\t exceptions_to_ignore           None\n",
+       "\t\t iterator                       None\n",
+       "\t\t sampler                        None\n",
+       "\tSteps                          \n",
+       "\t\t HadisdDataClass.HadISDIndex    {'HadISDIndex': {'station': "'010010-99999'", 'variables': "['total_cloud_cover', 'temperatures']"}}\n",
+       "\t\t values.SetMissingToNaN         {'SetMissingToNaN': {'varname_val_map': {'total_cloud_cover': '-888.0', 'low_cloud_cover': '-999.0', 'mid_cloud_cover': '-999.0', 'high_cloud_cover': '-999.0'}}}
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Graph

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "HadISDIndex_fd44edc8-ed77-4327-84bf-71f5366422f3\n", + "\n", + "HadisdDataClass.HadISDIndex\n", + "\n", + "\n", + "\n", + "SetMissingToNaN_80a8dc42-7ba3-45c4-8c61-84a1c4a83894\n", + "\n", + "values.SetMissingToNaN\n", + "\n", + "\n", + "\n", + "HadISDIndex_fd44edc8-ed77-4327-84bf-71f5366422f3->SetMissingToNaN_80a8dc42-7ba3-45c4-8c61-84a1c4a83894\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "data_prep_pipe = petpipe.Pipeline(\n", " petdata.archive.hadisd((\"010010-99999\"), variables = [\"total_cloud_cover\", \"temperatures\"]),\n", @@ -66,9 +525,582 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/joelmiller/Projects/Python/PyEarthTools_Fork/PyEarthTools/packages/data/src/pyearthtools/data/indexes/indexes.py:485: IndexWarning: Could not find time in dataset to select on. Petdt('1931-01-01T07')\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset> Size: 7MB\n",
+       "Dimensions:            (time: 276194)\n",
+       "Coordinates:\n",
+       "  * time               (time) datetime64[ns] 2MB 1931-01-01T06:00:00 ... 2023...\n",
+       "Data variables:\n",
+       "    temperatures       (time) float64 2MB dask.array<chunksize=(276194,), meta=np.ndarray>\n",
+       "    total_cloud_cover  (time) float64 2MB dask.array<chunksize=(276194,), meta=np.ndarray>\n",
+       "Attributes: (12/39)\n",
+       "    title:                       HadISD\n",
+       "    institution:                 Met Office Hadley Centre, Exeter, UK\n",
+       "    source:                      HadISD data product\n",
+       "    references:                  Dunn, 2019, Met Office Hadley Centre Technic...\n",
+       "    creator_name:                Robert Dunn\n",
+       "    creator_url:                 www.metoffice.gov.uk\n",
+       "    ...                          ...\n",
+       "    station_information:         Where station is a composite the station id ...\n",
+       "    Conventions:                 CF-1.6\n",
+       "    Metadata_Conventions:        Unidata Dataset Discovery v1.0, CF Discrete ...\n",
+       "    featureType:                 timeSeries\n",
+       "    processing_date:             08-Jan-2024\n",
+       "    history:                     Created by mk_netcdf_files.py \\nDuplicate Mo...
" + ], + "text/plain": [ + " Size: 7MB\n", + "Dimensions: (time: 276194)\n", + "Coordinates:\n", + " * time (time) datetime64[ns] 2MB 1931-01-01T06:00:00 ... 2023...\n", + "Data variables:\n", + " temperatures (time) float64 2MB dask.array\n", + " total_cloud_cover (time) float64 2MB dask.array\n", + "Attributes: (12/39)\n", + " title: HadISD\n", + " institution: Met Office Hadley Centre, Exeter, UK\n", + " source: HadISD data product\n", + " references: Dunn, 2019, Met Office Hadley Centre Technic...\n", + " creator_name: Robert Dunn\n", + " creator_url: www.metoffice.gov.uk\n", + " ... ...\n", + " station_information: Where station is a composite the station id ...\n", + " Conventions: CF-1.6\n", + " Metadata_Conventions: Unidata Dataset Discovery v1.0, CF Discrete ...\n", + " featureType: timeSeries\n", + " processing_date: 08-Jan-2024\n", + " history: Created by mk_netcdf_files.py \\nDuplicate Mo..." + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ds = data_prep_pipe[\"1931-01-01T07\"]\n", "ds" @@ -85,9 +1117,484 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.DataArray 'total_cloud_cover' (time: 276194)> Size: 2MB\n",
+       "dask.array<where, shape=(276194,), dtype=float64, chunksize=(276194,), chunktype=numpy.ndarray>\n",
+       "Coordinates:\n",
+       "  * time     (time) datetime64[ns] 2MB 1931-01-01T06:00:00 ... 2023-12-31T23:...\n",
+       "Attributes:\n",
+       "    long_name:      Total cloud cover (oktas)\n",
+       "    units:          1\n",
+       "    flagged_value:  -888\n",
+       "    valid_min:      0\n",
+       "    valid_max:      8\n",
+       "    standard_name:  cloud_area_fraction\n",
+       "    cell_methods:   latitude: longitude: time: point (derived in priority ord...
" + ], + "text/plain": [ + " Size: 2MB\n", + "dask.array\n", + "Coordinates:\n", + " * time (time) datetime64[ns] 2MB 1931-01-01T06:00:00 ... 2023-12-31T23:...\n", + "Attributes:\n", + " long_name: Total cloud cover (oktas)\n", + " units: 1\n", + " flagged_value: -888\n", + " valid_min: 0\n", + " valid_max: 8\n", + " standard_name: cloud_area_fraction\n", + " cell_methods: latitude: longitude: time: point (derived in priority ord..." + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# show total_cloud_cover from ds\n", "tcc = ds[\"total_cloud_cover\"]\n", From 1360631a30b2fa0d77fdc9e8afea2f42f1edd3ce Mon Sep 17 00:00:00 2001 From: Joel Miller Date: Thu, 24 Apr 2025 11:36:20 +0100 Subject: [PATCH 15/62] Add ReIndexTime transform to coordinates.py module --- .../data/transforms/coordinates.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/packages/data/src/pyearthtools/data/transforms/coordinates.py b/packages/data/src/pyearthtools/data/transforms/coordinates.py index 95c445c2..84935bab 100644 --- a/packages/data/src/pyearthtools/data/transforms/coordinates.py +++ b/packages/data/src/pyearthtools/data/transforms/coordinates.py @@ -21,6 +21,7 @@ import xarray as xr import numpy as np +import pandas as pd import pyearthtools.data @@ -206,6 +207,35 @@ def apply(self, dataset: xr.Dataset): return dataset +class ReIndexTime(Transform): + """Reindex the time coordinate to a complete hourly time series.""" +# TODO look into using Petdt for this + + def __init__(self, date_range: tuple[str | pd.Timestamp, str | pd.Timestamp], freq: str = "h"): + """ + Args: + date_range (tuple[str | pd.Timestamp, str | pd.Timestamp]): + Start and end dates for the desired time range. + """ + super().__init__() + self.record_initialisation() + self.date_range = date_range + self.freq = freq + + def apply(self, dataset: xr.Dataset) -> xr.Dataset: + """ + Apply the transform to reindex the time coordinate. 
+ + Args: + dataset (xr.Dataset): The dataset to transform. + + Returns: + xr.Dataset: The dataset with the time coordinate reindexed. + """ + dataset = dataset.load() # Ensure the dataset is fully loaded + date_obj = pd.date_range(self.date_range[0], self.date_range[1], freq=self.freq) + return dataset.reindex({"time": date_obj}) + class StandardCoordinateNames(Transform): """Convert xr.Dataset Coordinate Names into Standard Naming Scheme""" From ab870fe7e909f7a51a1adc90b0d593b829a444d5 Mon Sep 17 00:00:00 2001 From: Joel Miller Date: Wed, 30 Apr 2025 10:18:44 +0100 Subject: [PATCH 16/62] Minor Hadisd changes --- .../tutorial/HadISD_Dataset_Integration.ipynb | 3050 +++++++++++------ .../pyearthtools/tutorial/HadisdDataClass.py | 154 +- 2 files changed, 2070 insertions(+), 1134 deletions(-) diff --git a/notebooks/tutorial/HadISD_Dataset_Integration.ipynb b/notebooks/tutorial/HadISD_Dataset_Integration.ipynb index baa92779..50d32ed4 100644 --- a/notebooks/tutorial/HadISD_Dataset_Integration.ipynb +++ b/notebooks/tutorial/HadISD_Dataset_Integration.ipynb @@ -16,6 +16,8 @@ "metadata": {}, "outputs": [], "source": [ + "import datetime\n", + "\n", "import pyearthtools.pipeline as petpipe\n", "import pyearthtools.data as petdata\n", "import pyearthtools.tutorial\n" @@ -39,11 +41,250 @@ "outputs": [], "source": [ "varname_val_map = {\n", - " \"total_cloud_cover\": -888., \n", - " \"low_cloud_cover\": -888., \n", + " \"total_cloud_cover\": -999., \n", + " \"low_cloud_cover\": -999., \n", " \"mid_cloud_cover\": -999.,\n", " \"high_cloud_cover\": -999.\n", - " }" + " }\n", + "\n", + "# Probably sensible to accetp a list since some variables have multiple values\n", + "# This would look like:\n", + "# varname_val_map = {\n", + "# \"total_cloud_cover\": [-888., -999.],\n", + "# \"low_cloud_cover\": [-888., -999.],\n", + "# \"mid_cloud_cover\": [-999.],\n", + "# \"high_cloud_cover\": [-999.]\n", + "# }\n", + "# This is a list of variables to be used in the pipeline\n", + "# and 
the corresponding values to be masked\n", + "# in the data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "flagged_labels = [\n", + " 'temperatures', 'dewpoints', 'slp',\n", + " 'stnlp', 'windspeeds', 'winddirs', \n", + " 'total_cloud_cover', 'low_cloud_cover', 'mid_cloud_cover', \n", + " 'high_cloud_cover', 'precip1_depth', 'precip2_depth', \n", + " 'precip3_depth', 'precip6_depth', 'precip9_depth',\n", + " 'precip12_depth', 'precip15_depth', 'precip18_depth', \n", + " 'precip24_depth'\n", + " ]\n", + "\n", + "date_range=(datetime.datetime(1986,11,20,12,0), datetime.datetime(1986,11,21,0,0))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_prep_pipe = petpipe.Pipeline(\n", + " # petdata.archive.hadisd((\"010010-99999\"), variables = [\"total_cloud_cover\", \"temperatures\", \"flagged_obs\"]),\n", + " # petdata.transforms.values.SetMissingToNaN(varname_val_map),\n", + " petdata.archive.hadisd([\"010014-99999\", \"010010-99999\", \"010030-99999\"]), \n", + " petdata.transforms.values.AddFlaggedObs(flagged_labels),\n", + " petdata.transforms.values.SetMissingToNaN(varname_val_map),\n", + " # petdata.transforms.coordinates.ReIndexTime(date_range),\n", + "\n", + " # petdata.archive.hadisd(\"010014-99999\", variables = [\"slp\", \"dewpoints\", \"temperatures\", \"station_id\"]),\n", + " # petdata.archive.hadisd(country_code = \"24\", station_id = \"010010\", nearest_neighbors = 12, lon= 0.0, lat = 0.0), # Geospatial fence \n", + " # petdata.archive.hadisd(\"010014-99999\"), # Geospatial fence\n", + ")\n", + "data_prep_pipe\n", + "\n", + "# Don't know what combination of stations you want when selecting nearest neighbours, so want to cahce on a per-station basis\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ds = data_prep_pipe[\"1986-11-20T12\"]\n", + "ds" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In some cases you will get the following error when passing a date time to a pipeline object: `IndexWarning: Could not find time in dataset to select on. Petdt('1931-01-01T07')`
\n", + "\n", + "This indicates that data for the datetime you chose does not exist. In this case PET will load all data from your station selection." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# show total_cloud_cover from ds\n", + "tcc = ds[\"total_cloud_cover\"]\n", + "tcc" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Use the section below to build tests to check that all pipeline steps are working as expected" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test SetMissingToNaN" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import xarray as xr\n", + "from pyearthtools.data.transforms.values import SetMissingToNaN\n", + "import numpy as np\n", + "\n", + "def test_set_missing_to_nan():\n", + " data = xr.Dataset({\n", + " \"total_cloud_cover\": (\"time\", [0, -999, 50]),\n", + " \"low_cloud_cover\": (\"time\", [10, -999, 20]),\n", + " })\n", + "\n", + " varname_val_map = {\n", + " \"total_cloud_cover\": -99.0,\n", + " \"low_cloud_cover\": -999.0,\n", + " }\n", + "\n", + " transform = SetMissingToNaN(varname_val_map)\n", + " transformed_data = transform.apply(data)\n", + "\n", + " if np.isnan(transformed_data[\"total_cloud_cover\"].data[1]):\n", + " print(\"Total cloud cover at index 1 is NaN\")\n", + " else:\n", + " print(\"Total cloud cover at index 1 is not NaN\")\n", + " \n", + " if np.isnan(transformed_data[\"low_cloud_cover\"].data[1]):\n", + " print(\"Low cloud cover at index 1 is NaN\")\n", + " \n", + " if transformed_data[\"total_cloud_cover\"].data[2] == 50:\n", + " print(\"Total cloud cover at index 2 is 50\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_set_missing_to_nan()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test AddFlaggedObs" + ] + }, 
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import xarray as xr\n", + "import numpy as np\n", + "from pyearthtools.data.transforms.values import AddFlaggedObs\n", + "\n", + "def test_add_flagged_obs():\n", + " # Mock dataset with flagged observations\n", + " data = xr.Dataset(\n", + " {\n", + " \"temperatures\": ((\"time\",), [np.nan, 15.0, np.nan]),\n", + " \"dewpoints\": ((\"time\",), [np.nan, 10.0, np.nan]),\n", + " \"flagged_obs\": (\n", + " (\"time\", \"flagged\"),\n", + " [\n", + " [20.0, np.nan], # Flagged data for time=0\n", + " [np.nan, np.nan], # No flagged data for time=1\n", + " [25.0, 12.0], # Flagged data for time=2\n", + " ],\n", + " ),\n", + " },\n", + " coords={\n", + " \"time\": [0, 1, 2],\n", + " \"flagged\": [0, 1], # Indices for flagged variables\n", + " },\n", + " )\n", + "\n", + " # Add flagged_value attribute to variables\n", + " data[\"temperatures\"].attrs[\"flagged_value\"] = -999.0\n", + " data[\"dewpoints\"].attrs[\"flagged_value\"] = -999.0\n", + "\n", + " # Define flagged labels corresponding to the flagged dimension\n", + " flagged_labels = [\"temperatures\", \"dewpoints\"]\n", + "\n", + " # Apply the AddFlaggedObs transform\n", + " transform = AddFlaggedObs(flagged_labels)\n", + " transformed_data = transform.apply(data)\n", + "\n", + " # Assert that flagged data has been restored\n", + " assert np.allclose(\n", + " transformed_data[\"temperatures\"].data, [20.0, 15.0, 25.0], equal_nan=True\n", + " )\n", + " assert np.allclose(\n", + " transformed_data[\"dewpoints\"].data, [np.nan, 10.0, 12.0], equal_nan=True\n", + " )\n", + "\n", + " print(\"Test passed: Flagged observations were correctly restored.\")\n", + "\n", + "# Run the test\n", + "test_add_flagged_obs()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'010010-99999': 
PosixPath('/Users/joelmiller/Projects/data/hadisd/WMO_000000-029999/hadisd.3.4.0.2023f_19310101-20240101_010010-99999.nc')}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pyearthtools.tutorial.HadisdDataClass import HadISDIndex\n", + "\n", + "hadisd = HadISDIndex([\"010010-99999\"])\n", + "paths = hadisd.filesystem([\"010010-99999\"])\n", + "paths\n" ] }, { @@ -55,351 +296,340 @@ "data": { "text/html": [ "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Pipeline\n",
-       "\tDescription                    `pyearthtools.pipeline` Data Pipeline\n",
+       "
<xarray.Dataset> Size: 252MB\n",
+       "Dimensions:                (coordinate_length: 1, time: 276194, test: 71,\n",
+       "                            flagged: 19, reporting_v: 19, reporting_t: 1116,\n",
+       "                            reporting_2: 2)\n",
+       "Coordinates:\n",
+       "    longitude              (coordinate_length) float64 8B dask.array<chunksize=(1,), meta=np.ndarray>\n",
+       "    latitude               (coordinate_length) float64 8B dask.array<chunksize=(1,), meta=np.ndarray>\n",
+       "    elevation              (coordinate_length) float64 8B dask.array<chunksize=(1,), meta=np.ndarray>\n",
+       "  * time                   (time) datetime64[ns] 2MB 1931-01-01T06:00:00 ... ...\n",
+       "Dimensions without coordinates: coordinate_length, test, flagged, reporting_v,\n",
+       "                                reporting_t, reporting_2\n",
+       "Data variables: (12/27)\n",
+       "    station_id             |S12 12B ...\n",
+       "    temperatures           (time) float64 2MB dask.array<chunksize=(276194,), meta=np.ndarray>\n",
+       "    dewpoints              (time) float64 2MB dask.array<chunksize=(276194,), meta=np.ndarray>\n",
+       "    slp                    (time) float64 2MB dask.array<chunksize=(276194,), meta=np.ndarray>\n",
+       "    stnlp                  (time) float64 2MB dask.array<chunksize=(276194,), meta=np.ndarray>\n",
+       "    windspeeds             (time) float64 2MB dask.array<chunksize=(276194,), meta=np.ndarray>\n",
+       "    ...                     ...\n",
+       "    wind_gust              (time) float64 2MB dask.array<chunksize=(276194,), meta=np.ndarray>\n",
+       "    past_sigwx1            (time) float64 2MB dask.array<chunksize=(276194,), meta=np.ndarray>\n",
+       "    input_station_id       (time) object 2MB dask.array<chunksize=(276194,), meta=np.ndarray>\n",
+       "    quality_control_flags  (time, test) float64 157MB dask.array<chunksize=(69049, 18), meta=np.ndarray>\n",
+       "    flagged_obs            (time, flagged) float64 42MB dask.array<chunksize=(138097, 10), meta=np.ndarray>\n",
+       "    reporting_stats        (reporting_v, reporting_t, reporting_2) float64 339kB dask.array<chunksize=(19, 1116, 2), meta=np.ndarray>\n",
+       "Attributes: (12/39)\n",
+       "    title:                       HadISD\n",
+       "    institution:                 Met Office Hadley Centre, Exeter, UK\n",
+       "    source:                      HadISD data product\n",
+       "    references:                  Dunn, 2019, Met Office Hadley Centre Technic...\n",
+       "    creator_name:                Robert Dunn\n",
+       "    creator_url:                 www.metoffice.gov.uk\n",
+       "    ...                          ...\n",
+       "    station_information:         Where station is a composite the station id ...\n",
+       "    Conventions:                 CF-1.6\n",
+       "    Metadata_Conventions:        Unidata Dataset Discovery v1.0, CF Discrete ...\n",
+       "    featureType:                 timeSeries\n",
+       "    processing_date:             08-Jan-2024\n",
+       "    history:                     Created by mk_netcdf_files.py \\nDuplicate Mo...