Merged
Changes from all commits
22 commits
ee14f2c
Change get_all_station_ids method to work with new zarr structure
millerjoel Jul 21, 2025
ef1fc30
Update filesystem method to work with simplified file structure
millerjoel Jul 21, 2025
6693b7c
Add functionality to loop over station ranges for downloads
millerjoel Jul 22, 2025
d50c0d2
Make downloading and extraction even more idempotent. Final cell of d…
millerjoel Jul 22, 2025
44b819c
Improve conversion efficiency
millerjoel Jul 23, 2025
6f976d5
Tidy station list
millerjoel Jul 28, 2025
d894f7f
Improve notebook descriptions
millerjoel Jul 28, 2025
026a6d8
Ruff compliance added
Aug 11, 2025
c3a7ee7
Merge branch 'ACCESS-Community-Hub:develop' into hadisd-simplify-146
millerjoel Aug 14, 2025
05e0321
Update Drop transform for more robust handling
millerjoel Aug 29, 2025
f1ae57f
Update download notebook to show good output
millerjoel Aug 29, 2025
56434c9
Update conversion to zarr notebook to show good output
millerjoel Aug 29, 2025
d0eb728
Add docstrings
millerjoel Aug 29, 2025
217a220
Merge branch 'ACCESS-Community-Hub:develop' into hadisd-simplify-146
millerjoel Sep 1, 2025
78e9e19
Improve error message for numpy conversion process
tennlee Sep 2, 2025
23a9851
Run again with good output after variable/numpy fix
millerjoel Sep 2, 2025
4e8d251
Update pre-processing to remove redundant station_id variable causing…
millerjoel Sep 2, 2025
92a1908
Update pre-processing to remove redundant station_id variable causing…
millerjoel Sep 2, 2025
07ef03c
Update notebook to plot confusion matrix with matplotlib and adjust p…
millerjoel Sep 2, 2025
35531d4
Skip radar projection tests if radar data is unavailable
tennlee Sep 3, 2025
a407506
Run code reformatting
tennlee Sep 3, 2025
5241085
Address code linting tool feedback
tennlee Sep 3, 2025
387 changes: 274 additions & 113 deletions notebooks/tutorial/HadISD/1_HadISD_Download.ipynb

Large diffs are not rendered by default.

3,662 changes: 3,581 additions & 81 deletions notebooks/tutorial/HadISD/2_HadISD_to_zarr.ipynb

Large diffs are not rendered by default.

862 changes: 830 additions & 32 deletions notebooks/tutorial/HadISD/3_HadISD_XGBoost_Pipeline.ipynb

Large diffs are not rendered by default.

57 changes: 36 additions & 21 deletions notebooks/tutorial/HadISD/Data_Config.ipynb
@@ -57,27 +57,42 @@
"metadata": {},
"outputs": [],
"source": [
"# A sample list of WMO number ranges. Users can find more at the official HadISD download page.\n",
"# For any station ranges you don't want to download, you can comment them out here\n",
"wmo_id_ranges = [\n",
" \"000000-029999\",\n",
" \"080000-099999\",\n",
" \"200000-249999\",\n",
" \"720000-721999\",\n",
" #\"000000-029999\",\n",
" #\"030000-049999\",\n",
" #\"050000-079999\",\n",
" #\"080000-099999\",\n",
" #\"100000-149999\",\n",
" #\"150000-199999\",\n",
" #\"200000-249999\",\n",
" #\"250000-299999\",\n",
" #\"300000-349999\",\n",
" #\"350000-399999\",\n",
" #\"400000-449999\",\n",
" #\"450000-499999\",\n",
" \"500000-549999\",\n",
" #\"550000-599999\",\n",
" #\"600000-649999\",\n",
" #\"650000-699999\",\n",
" #\"700000-709999\",\n",
" #\"710000-714999\",\n",
" #\"715000-719999\",\n",
" #\"720000-721999\",\n",
" \"722000-722999\",\n",
" #\"723000-723999\",\n",
" #\"724000-724999\",\n",
" #\"725000-725999\",\n",
" #\"726000-726999\",\n",
" #\"727000-729999\",\n",
" #\"730000-799999\",\n",
" \"800000-849999\",\n",
" #\"850000-899999\",\n",
" #\"900000-949999\",\n",
" #\"950000-999999\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "35617ad2",
"metadata": {},
"outputs": [],
"source": [
"\n",
"# User sets the WMO number range to download\n",
"wmo_id_range = \"080000-099999\" # Change this to the desired WMO range, either from the sample list or from the HadISD page."
]
},
{
"cell_type": "markdown",
"id": "7aec321a",
@@ -97,15 +112,15 @@
"# Set the date range to reindex the time coordinate\n",
"DATE_RANGE = (\"1970-01-01T00\", \"2023-12-31T23\")\n",
"# Set the input directory to the folder with raw NetCDFs\n",
"input_dir = download_dir / f\"WMO_{wmo_id_range}\" / \"netcdf\"\n",
"input_dir = download_dir / \"netcdf\"\n",
"# Set the Zarr output directory to a sibling folder under the same WMO directory\n",
"zarr_output_dir = download_dir / f\"WMO_{wmo_id_range}\" / \"zarr\""
"zarr_output_dir = download_dir / \"zarr\""
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "pet_tutorial",
"language": "python",
"name": "python3"
},
@@ -119,7 +134,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.2"
"version": "3.11.11"
}
},
"nbformat": 4,
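The Data_Config change above flattens the layout: raw NetCDFs now live in a single `netcdf` folder, the Zarr output in a sibling `zarr` folder, and the commented `wmo_id_ranges` list controls which station ranges get downloaded. As a rough sketch of how that list can drive a download loop (the paths and archive file-name pattern here are assumptions, not the notebook's exact code):

```python
from pathlib import Path

# Assumed locations; the notebook defines download_dir itself.
download_dir = Path.home() / "hadisd_data"
netcdf_dir = download_dir / "netcdf"   # matches the simplified input_dir
zarr_dir = download_dir / "zarr"       # matches the simplified zarr_output_dir

# Whichever ranges were left uncommented in wmo_id_ranges.
wmo_id_ranges = ["000000-029999", "500000-549999"]

for wmo_id_range in wmo_id_ranges:
    # Hypothetical archive name; check the HadISD download page for the real pattern.
    archive = f"hadisd_{wmo_id_range}.tar.gz"
    print(f"would fetch {archive} and extract its NetCDFs into {netcdf_dir}")
```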
36 changes: 8 additions & 28 deletions notebooks/tutorial/HadISD/HadISD_QC_Exploration.ipynb
@@ -15,10 +15,7 @@
"metadata": {},
"outputs": [],
"source": [
"import datetime\n",
"import numpy as np\n",
"import pandas as pd\n",
"from pathlib import Path\n",
"\n",
"import pyearthtools.pipeline as petpipe\n",
"import pyearthtools.data as petdata\n",
Expand All @@ -27,12 +24,13 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": null,
"id": "4a7da841",
"metadata": {},
"outputs": [],
"source": [
"# %run HadISD_config.ipynb"
"# ruff: noqa: F821\n",
"%run Pipeline_Config.ipynb"
]
},
{
@@ -89,17 +87,7 @@
"metadata": {},
"outputs": [],
"source": [
"y = data_prep_pipe[\"1969-01-01T07\"]\n",
"y"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3d3819b9",
"metadata": {},
"outputs": [],
"source": [
"x = data_prep_pipe[\"1969-01-01T07\"]\n",
"x"
]
},
@@ -110,7 +98,7 @@
"metadata": {},
"outputs": [],
"source": [
"qc = y[\"quality_control_flags\"].values\n"
"qc = x[\"quality_control_flags\"].values\n"
]
},
{
@@ -214,21 +202,13 @@
"# for qc, test 12, time 826, station 0, print the value of the test\n",
"print(\"QC value for test 12, time 826, station 0:\", qc[0, 826, 12])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8c8f1bc9",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "pyearthtools",
"language": "python",
"name": "python3"
"name": "pyearthtools"
},
"language_info": {
"codemirror_mode": {
@@ -240,7 +220,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.2"
"version": "3.13.5"
}
},
"nbformat": 4,
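The QC exploration notebook above reads a single pipeline sample and indexes the `quality_control_flags` array as `[station, time, test]`. A minimal, self-contained sketch of that indexing (the dimension order and array shape are assumptions based on the notebook's usage):

```python
import numpy as np
import xarray as xr

# Toy stand-in for a pipeline sample: one station, hourly steps, ~70 QC tests.
x = xr.Dataset(
    {"quality_control_flags": (("station", "time", "test"), np.zeros((1, 1000, 70), dtype=int))}
)

qc = x["quality_control_flags"].values
print("flagged cells for station 0:", int((qc[0] != 0).sum()))
print("QC value for test 12, time 826, station 0:", qc[0, 826, 12])
```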
20 changes: 14 additions & 6 deletions notebooks/tutorial/HadISD/Pipeline_Config.ipynb
@@ -2,20 +2,20 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "1abab3c3",
"metadata": {},
"outputs": [],
"source": [
"# import pyearthtools.pipeline as petpipe"
"import pyearthtools.pipeline as petpipe"
]
},
{
"cell_type": "markdown",
"id": "2307aa00",
"metadata": {},
"source": [
"# Lists and Dictionaries Required For Sustom Pipeline Steps"
"# Lists and Dictionaries Required For Custom Pipeline Steps"
]
},
{
@@ -62,6 +62,16 @@
"source": [
"# Custom operation to remove redundent coordinates\n",
"class SqueezeStationCoordinates(petpipe.Operation):\n",
" \"\"\"\n",
" Squeeze singleton dimensions from specified station-based coordinates in an xarray.Dataset.\n",
"\n",
" This operation is useful for removing unnecessary singleton dimensions (e.g., shape (n, 1))\n",
" from coordinates like latitude, longitude, and elevation, ensuring they are 1D and indexed\n",
" by 'station'.\n",
"\n",
" Args:\n",
" coords (tuple of str): Names of coordinates to squeeze. Defaults to (\"latitude\", \"longitude\", \"elevation\").\n",
" \"\"\"\n",
" def __init__(self, coords=(\"latitude\", \"longitude\", \"elevation\")):\n",
" super().__init__()\n",
" self.coords = coords\n",
@@ -74,7 +84,7 @@
" # Undo function added otherwise pyearthtools will complain\n",
" def undo_func(self, ds):\n",
" # No undo operation needed for this operation\n",
" return ds"
" return ds\n"
]
},
{
@@ -314,8 +324,6 @@
"metadata": {},
"outputs": [],
"source": [
"import xarray as xr\n",
"import numpy as np\n",
"from pyearthtools.data.transforms.values import AddFlaggedObs\n",
"\n",
"def test_add_flagged_obs():\n",
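The new `SqueezeStationCoordinates` operation documented above removes singleton dimensions from station coordinates so that latitude, longitude and elevation end up 1-D and indexed by `station`. A plain-xarray sketch of the same idea, outside the pipeline (the coordinate shapes here are illustrative assumptions):

```python
import numpy as np
import xarray as xr

# Coordinates arriving with a spurious trailing dimension of length 1.
ds = xr.Dataset(
    coords={
        "station": ("station", ["941200", "946720"]),
        "latitude": (("station", "extra"), np.array([[-12.4], [-37.7]])),
        "longitude": (("station", "extra"), np.array([[130.9], [144.8]])),
    }
)

for name in ("latitude", "longitude", "elevation"):
    if name in ds.coords and ds[name].ndim > 1:
        # Re-attach the coordinate as a 1-D, station-indexed array.
        ds = ds.assign_coords({name: ("station", ds[name].values.squeeze())})

print(ds["latitude"].dims)  # ('station',)
```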
2 changes: 1 addition & 1 deletion packages/data/src/pyearthtools/data/indexes/_indexes.py
@@ -66,7 +66,7 @@

LOG = logging.getLogger("pyearthtools.data")


class Index(CallRedirectMixin, CatalogMixin, metaclass=ABCMeta):
"""
Base Level Index to define the structure
@@ -83,7 +83,7 @@ def get_config(mf: bool = False):
return xr.open_mfdataset(
filter_files(location),
decode_timedelta=True, # TODO: should we raise a warning? It seems to be required for almost all our data.
compat='override',
compat="override",
**get_config(True),
)

17 changes: 13 additions & 4 deletions packages/data/src/pyearthtools/data/transforms/variables.py
@@ -87,11 +87,20 @@ def apply(self, dataset: xr.Dataset) -> xr.Dataset:
if self._variables is None:
return dataset

var_included = set(dataset.data_vars).difference(set(self._variables))
# 3/9/2025 - old logic was replaced with a simple drop of the variables
# A new issue will be raised to review how coordinate protection should
# work because people need a way to drop coords when needed.

if not var_included:
return dataset
return dataset[var_included]
# Calculate the difference between the data variables on the dataset
# and the variables requested for drop. This leaves coordinate variables
# unaffected
# var_included = set(dataset.data_vars).difference(set(self._variables))

# if not var_included:
# return dataset
# return dataset[var_included]

return dataset.drop_vars(self._variables)


class Select(Transform):
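The `Drop` transform above now delegates straight to xarray's `drop_vars` instead of selecting the set difference of data variables, so requested coordinate variables can now be removed as well. A small illustration of the behavioural difference with plain xarray (not the transform itself):

```python
import numpy as np
import xarray as xr

ds = xr.Dataset(
    {"temperature": ("time", np.arange(3.0)), "station_id": ("time", np.zeros(3))},
    coords={"time": np.arange(3), "elevation": 42.0},
)
to_drop = ["station_id", "elevation"]

# Old-style logic: only data variables could ever be excluded.
remaining = set(ds.data_vars) - set(to_drop)
old_result = ds[list(remaining)]            # the 'elevation' coordinate survives

# New logic: drop the names directly, coordinates included.
new_result = ds.drop_vars(to_drop)

print("elevation" in old_result.coords, "elevation" in new_result.coords)  # True False
```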
1 change: 1 addition & 0 deletions packages/data/tests/indexes/test_indexes.py
@@ -7,6 +7,7 @@

from pyearthtools.data.exceptions import DataNotFoundError


def test_Index(monkeypatch):

monkeypatch.setattr("pyearthtools.data.indexes.Index.__abstractmethods__", set())
19 changes: 12 additions & 7 deletions packages/nci_site_archive/tests/test_radar_proj.py
@@ -24,13 +24,18 @@
import platform
import xarray as xr

from site_archive_nci._Rainfields3 import (
ErrorRadarProj,
ProjErrorStatus,
ProjKind,
RadarProj,
WarnRadarProj,
)
try:
from site_archive_nci._Rainfields3 import (
ErrorRadarProj,
ProjErrorStatus,
ProjKind,
RadarProj,
WarnRadarProj,
)

except ImportError:
pytest.skip(allow_module_level=True)


PYPROJ_SAMPLE = pyproj.Proj("+proj=aea +lat_1=-36 +lat_2=-18 +lon_0=132 +units=m")
EXPECTED_KEYS = [
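The radar-projection tests above now guard the `site_archive_nci._Rainfields3` import so the whole module is skipped when the import fails. The general pattern, as a standalone sketch (the module name and reason text are placeholders):

```python
import pytest

try:
    import some_optional_dependency  # hypothetical optional import
except ImportError:
    # Skipping at module level prevents collection errors for the remaining tests.
    pytest.skip("optional dependency not installed", allow_module_level=True)


def test_something_with_the_dependency():
    assert some_optional_dependency is not None
```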
@@ -46,15 +46,15 @@ def unjoin(self, sample: Any) -> tuple:


class LatLonInterpolate(Joiner):
'''
"""
Makes additional assumptions about how interpolation should work and
how the data is structured. In this case, interpolation is primarily
expected to occur according to latitude and longitude, performing
expected to occur according to latitude and longitude, performing
no broadcasting, and iterating over other dimensions instead.

It assumed the dimensions 'latitude', 'longitude', 'time', and 'level' will
be present. 'lat' or 'lon' may also be used for convenience.
'''
"""

_override_interface = "Serial"

@@ -78,15 +78,15 @@ def __init__(
self._merge_kwargs = merge_kwargs

def raise_if_dimensions_wrong(self, dataset):
'''
"""
Raise exceptions if the supplied dataset does not meet requirements
'''
"""

if not hasattr(self, 'required_dims'):
if 'lat' in dataset.coords:
self.required_dims = ['lat', 'lon']
if not hasattr(self, "required_dims"):
if "lat" in dataset.coords:
self.required_dims = ["lat", "lon"]
else:
self.required_dims = ['latitude', 'longitude']
self.required_dims = ["latitude", "longitude"]

present_in_coords = [d in dataset.coords for d in self.required_dims]
if not all(present_in_coords):
@@ -100,12 +100,12 @@ def raise_if_dimensions_wrong(self, dataset):
# raise ValueError(f"Cannot perform a GeoMergePancake on the data variables {data_var} without {self.required_dims} as a dimension")

def maybe_interp(self, ds):
'''
"""
This method will only interpolate the datasets if the latitudes and longitudes don't already
match. This means, for example, you can't use it to interpolate between time steps
or vertical levels alone. The primary purpose here is lat/lon interpolation, not general
model interpolation or arbitrarily-dimensioned data interpolation.
'''
"""

ds_coords_ok = [ds[coord].equals(self.reference_dataset[coord]) for coord in self.required_dims]

@@ -115,7 +115,6 @@

return ds


def _join_two_datasets(self, sample_a: xr.Dataset, sample_b: xr.Dataset) -> xr.Dataset:
"""
Used to reduce a sequence of joinable items. Only called by the public interface join method.
@@ -144,7 +143,7 @@ def join(self, sample: tuple[Union[xr.Dataset, xr.DataArray], ...]) -> xr.Dataset:
return merged

def unjoin(self, sample: Any) -> tuple:
raise NotImplementedError("Not Implemented")
raise NotImplementedError("Not Implemented")


class GeospatialTimeSeriesMerge(Joiner):
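Finally, the `LatLonInterpolate` docstrings above describe `maybe_interp` as interpolating only when the latitude/longitude grids differ from the reference dataset. A rough standalone sketch of that guard (names and data are assumptions, and multi-dimensional `interp` needs scipy installed):

```python
import numpy as np
import xarray as xr

reference = xr.Dataset(
    coords={"latitude": np.linspace(-40.0, -10.0, 4), "longitude": np.linspace(110.0, 155.0, 4)}
)
sample = xr.Dataset(
    {"t2m": (("latitude", "longitude"), np.random.rand(3, 3))},
    coords={"latitude": np.linspace(-40.0, -10.0, 3), "longitude": np.linspace(110.0, 155.0, 3)},
)

required_dims = ["latitude", "longitude"]
coords_match = all(sample[c].equals(reference[c]) for c in required_dims)
if not coords_match:
    # Interpolate onto the reference grid; other dims (e.g. time, level) are untouched.
    sample = sample.interp({c: reference[c] for c in required_dims})

print(dict(sample.sizes))  # {'latitude': 4, 'longitude': 4}
```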