From ee14f2c418d59ce68c7460b564082a2289c4e356 Mon Sep 17 00:00:00 2001 From: Joel Miller Date: Mon, 21 Jul 2025 23:23:24 +0100 Subject: [PATCH 01/20] Change get_all_station_ids method to work with new zarr structure --- .../pyearthtools/tutorial/HadisdDataClass.py | 60 +++++-------------- 1 file changed, 15 insertions(+), 45 deletions(-) diff --git a/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py b/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py index 1fba6cf9..41321c67 100644 --- a/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py +++ b/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py @@ -136,62 +136,32 @@ def __init__( self.record_initialisation() - # def get_all_station_ids(self, root_directory: Path | str) -> list[str]: - # """ - # Retrieve all station IDs by scanning the dataset directory. - - # Args: - # root_directory (Path | str): The root directory containing station data. - - # Returns: - # list[str]: A list of all station IDs. - # """ - # root_directory = Path(root_directory) - # station_ids = [] - # for folder in cached_iterdir(root_directory): - # if folder.is_dir(): - # for file in cached_iterdir(folder): - # if file.suffix == ".nc": # Check for NetCDF files - # # Extract the station ID from the filename - # station_id = file.stem.split("_")[-1] # Assuming station ID is the last part of the filename - # station_ids.append(station_id) - # return station_ids - def get_all_station_ids(self, root_directory: Path | str = None) -> list[str]: """ - Retrieve all station IDs by scanning the dataset directory. + Retrieve all station IDs by scanning the Zarr directory. Args: - root_directory (Path | str, optional): The root directory containing station data. - Defaults to HADISD_HOME/netcdf. + root_directory (Path | str, optional): The directory containing Zarr files. + Defaults to HADISD_HOME/zarr. Returns: list[str]: A list of all station IDs. 
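
        Example:
            A hypothetical call for illustration only (the constructor name and
            setup are placeholders, not the tutorial's exact API):

            >>> hadisd = HadisdData()                # hypothetical instance
            >>> ids = hadisd.get_all_station_ids()  # scans HADISD_HOME/zarr by default
            >>> all(isinstance(sid, str) for sid in ids)
            True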
""" - HADISD_HOME = self.ROOT_DIRECTORIES["hadisd"] if root_directory is None: - # Search all WMO folders for netcdf subfolders - wmo_folders = [f for f in Path(HADISD_HOME).iterdir() if f.is_dir() and f.name.startswith("WMO_")] - station_ids = [] - for wmo_folder in wmo_folders: - netcdf_dir = wmo_folder / "netcdf" - if cached_exists(netcdf_dir): - for file in cached_iterdir(netcdf_dir): - if file.suffix == ".nc": - station_id = file.stem.split("_")[-1] - station_ids.append(station_id) - return station_ids + zarr_dir = Path(HADISD_HOME) / "zarr" else: - root_directory = Path(root_directory) - if not cached_exists(root_directory): - raise DataNotFoundError(f"Root directory does not exist: {root_directory}") - station_ids = [] - for file in cached_iterdir(root_directory): - if file.suffix == ".nc": - station_id = file.stem.split("_")[-1] - station_ids.append(station_id) - return station_ids + zarr_dir = Path(root_directory) + + if not cached_exists(zarr_dir): + raise DataNotFoundError(f"Zarr directory does not exist: {zarr_dir}") + + station_ids = [] + for file in cached_iterdir(zarr_dir): + if file.suffix == ".zarr": + station_id = file.stem.split("_")[-1] + station_ids.append(station_id) + return station_ids def filesystem(self, *args, date_range=("1970-01-01T00", "2023-12-31T23"), **kwargs) -> dict[str, Path]: """ From ef1fc30590356c0971a0d518df8feb62e6d3b498 Mon Sep 17 00:00:00 2001 From: Joel Miller Date: Mon, 21 Jul 2025 23:34:37 +0100 Subject: [PATCH 02/20] Update filesystem method to work with simplified file structure --- .../pyearthtools/tutorial/HadisdDataClass.py | 59 ++----------------- 1 file changed, 5 insertions(+), 54 deletions(-) diff --git a/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py b/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py index 41321c67..870ee9e3 100644 --- a/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py +++ b/packages/tutorial/src/pyearthtools/tutorial/HadisdDataClass.py @@ -192,66 +192,17 @@ def filesystem(self, *args, date_range=("1970-01-01T00", "2023-12-31T23"), **kwa if not isinstance(station_ids, list) or not all(isinstance(sid, str) for sid in station_ids): raise TypeError(f"Expected station_ids to be a str or list[str], but got: {type(station_ids)}") - # Define the station ranges and corresponding folders - STATION_RANGES = [ - (0, 29999, "WMO_000000-029999"), - (30000, 49999, "WMO_030000-049999"), - (50000, 79999, "WMO_050000-079999"), - (80000, 99999, "WMO_080000-099999"), - (100000, 149999, "WMO_100000-149999"), - (150000, 199999, "WMO_150000-199999"), - (200000, 249999, "WMO_200000-249999"), - (250000, 299999, "WMO_250000-299999"), - (300000, 349999, "WMO_300000-349999"), - (350000, 399999, "WMO_350000-399999"), - (400000, 449999, "WMO_400000-449999"), - (450000, 499999, "WMO_450000-499999"), - (500000, 549999, "WMO_500000-549999"), - (550000, 599999, "WMO_550000-599999"), - (600000, 649999, "WMO_600000-649999"), - (650000, 699999, "WMO_650000-699999"), - (700000, 709999, "WMO_700000-709999"), - (710000, 714999, "WMO_710000-714999"), - (715000, 719999, "WMO_715000-719999"), - (720000, 721999, "WMO_720000-721999"), - (722000, 722999, "WMO_722000-722999"), - (723000, 723999, "WMO_723000-723999"), - (724000, 724999, "WMO_724000-724999"), - (725000, 725999, "WMO_725000-725999"), - (726000, 726999, "WMO_726000-726999"), - (727000, 729999, "WMO_727000-729999"), - (730000, 799999, "WMO_730000-799999"), - (800000, 849999, "WMO_800000-849999"), - (850000, 899999, "WMO_850000-899999"), - (900000, 949999, 
"WMO_900000-949999"), - (950000, 999999, "WMO_950000-999999"), - ] - # Map station IDs to their file paths paths = {} for station_id in station_ids: - wmo_number = station_id[:6] # Extract the first 6 digits of the station ID - station_numeric = int(wmo_number) # Convert the WMO number to an integer - - # Find the parent folder dynamically - parent_folder = None - for start, end, folder in STATION_RANGES: - if start <= station_numeric <= end: - parent_folder = folder - break - - if parent_folder is None: - raise ValueError(f"Station ID {station_id} does not fall within any defined range.") - - # Construct the expected filename - date_range = "19310101-20240101" # Hardcoded for now; adjust if dataset is updated + date_range_str = "19310101-20240101" # Hardcoded for now; adjust if dataset is updated version = "hadisd.3.4.0.2023f" - filename_nc = f"{version}_{date_range}_{station_id}.nc" - filename_zarr = f"{version}_{date_range}_{station_id}.zarr" + filename_nc = f"{version}_{date_range_str}_{station_id}.nc" + filename_zarr = f"{version}_{date_range_str}_{station_id}.zarr" # Construct the full path - _file_path_nc = Path(HADISD_HOME) / parent_folder / "netcdf" / filename_nc - file_path_zarr = Path(HADISD_HOME) / parent_folder / "zarr" / filename_zarr + file_path_nc = Path(HADISD_HOME) / "netcdf" / filename_nc + file_path_zarr = Path(HADISD_HOME) / "zarr" / filename_zarr # Check if the file exists (comment out if testing with single netcdf) if not file_path_zarr.exists(): From 6693b7cdbe3e178f4ea2bf4a48bda8f371eaa508 Mon Sep 17 00:00:00 2001 From: Joel Miller Date: Tue, 22 Jul 2025 09:17:47 +0100 Subject: [PATCH 03/20] Add functionality to loop over station rnges for downloads --- .../tutorial/HadISD/1_HadISD_Download.ipynb | 169 +++++++++--------- 1 file changed, 87 insertions(+), 82 deletions(-) diff --git a/notebooks/tutorial/HadISD/1_HadISD_Download.ipynb b/notebooks/tutorial/HadISD/1_HadISD_Download.ipynb index 31a7c932..3245e0d8 100644 --- a/notebooks/tutorial/HadISD/1_HadISD_Download.ipynb +++ b/notebooks/tutorial/HadISD/1_HadISD_Download.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "e4fb7b1d", "metadata": {}, "outputs": [], @@ -30,7 +30,8 @@ "from tqdm.auto import tqdm\n", "import tarfile\n", "import gzip\n", - "import shutil" + "import shutil\n", + "from pathlib import Path" ] }, { @@ -93,12 +94,7 @@ "metadata": {}, "outputs": [], "source": [ - "wmo_id_range = wmo_id_range # This has been defined in HadISD_data_config.ipynb\n", - "\n", - "wmo_str = f\"WMO_{wmo_id_range}\"\n", - "url = f\"https://www.metoffice.gov.uk/hadobs/hadisd/v340_2023f/data/{wmo_str}.tar.gz\"\n", - "tar_name = f\"{wmo_str}.tar\"\n", - "filename = download_dir / tar_name" + "wmo_id_range = wmo_id_range # This has been defined in HadISD_data_config.ipynb" ] }, { @@ -108,45 +104,50 @@ "metadata": {}, "outputs": [], "source": [ - "# Get remote file size using HTTP HEAD\n", - "head = requests.head(url, allow_redirects=True)\n", - "remote_size = int(head.headers.get('content-length', 0))\n", + "def download_wmo_range(wmo_id_range, download_dir):\n", + " wmo_str = f\"WMO_{wmo_id_range}\"\n", + " url = f\"https://www.metoffice.gov.uk/hadobs/hadisd/v340_2023f/data/{wmo_str}.tar.gz\"\n", + " tar_name = f\"{wmo_str}.tar\"\n", + " filename = Path(download_dir) / tar_name\n", "\n", - "local_size = filename.stat().st_size if filename.exists() else 0\n", + " head = requests.head(url, allow_redirects=True)\n", + " remote_size = int(head.headers.get('content-length', 
0))\n", + " local_size = filename.stat().st_size if filename.exists() else 0\n", "\n", - "if filename.exists() and local_size == remote_size:\n", - " print(f\"File already fully downloaded: {filename} ({local_size/1024**2:.2f} MB)\")\n", - "else:\n", - " headers = {}\n", - " mode = 'wb'\n", - " initial_pos = 0\n", - " if filename.exists() and local_size < remote_size:\n", - " headers['Range'] = f'bytes={local_size}-'\n", - " mode = 'ab'\n", - " initial_pos = local_size\n", - " print(f\"Resuming download for {filename.name} at {local_size/1024**2:.2f} MB...\")\n", + " if filename.exists() and local_size == remote_size:\n", + " print(f\"File already fully downloaded: {filename} ({local_size/1024**2:.2f} MB)\")\n", " else:\n", - " print(f\"Starting download for {filename.name}...\")\n", + " headers = {}\n", + " mode = 'wb'\n", + " initial_pos = 0\n", + " if filename.exists() and local_size < remote_size:\n", + " headers['Range'] = f'bytes={local_size}-'\n", + " mode = 'ab'\n", + " initial_pos = local_size\n", + " print(f\"Resuming download for {filename.name} at {local_size/1024**2:.2f} MB...\")\n", + " else:\n", + " print(f\"Starting download for {filename.name}...\")\n", "\n", - " response = requests.get(url, stream=True, headers=headers)\n", - " total = remote_size\n", + " response = requests.get(url, stream=True, headers=headers)\n", + " total = remote_size\n", + " with open(filename, mode) as f, tqdm(\n", + " desc=f\"Downloading {filename.name}\",\n", + " total=total,\n", + " initial=initial_pos,\n", + " unit='B', unit_scale=True, unit_divisor=1024\n", + " ) as bar:\n", + " for chunk in response.iter_content(chunk_size=8192):\n", + " if chunk:\n", + " f.write(chunk)\n", + " bar.update(len(chunk))\n", "\n", - " with open(filename, mode) as f, tqdm(\n", - " desc=f\"Downloading {filename.name}\",\n", - " total=total,\n", - " initial=initial_pos,\n", - " unit='B', unit_scale=True, unit_divisor=1024\n", - " ) as bar:\n", - " for chunk in response.iter_content(chunk_size=8192):\n", - " if chunk:\n", - " f.write(chunk)\n", - " bar.update(len(chunk))\n", + " final_size = filename.stat().st_size\n", + " if final_size == remote_size:\n", + " print(f\"Download complete: {filename} ({final_size/1024**2:.2f} MB)\")\n", + " else:\n", + " print(f\"Warning: Download incomplete. Local size: {final_size}, Remote size: {remote_size}\")\n", "\n", - " final_size = filename.stat().st_size\n", - " if final_size == remote_size:\n", - " print(f\"Download complete: {filename} ({final_size/1024**2:.2f} MB)\")\n", - " else:\n", - " print(f\"Warning: Download incomplete. Local size: {final_size}, Remote size: {remote_size}\")\n", + " return filename, tar_name\n", "\n", "# Possibly also add check to see if netcdf files esist for the downloaded tar file, if so then don't download again" ] @@ -166,66 +167,70 @@ "metadata": {}, "outputs": [], "source": [ - "extract_dir = download_dir / tar_name.replace('.tar', '')\n", - "extract_dir.mkdir(exist_ok=True)\n", - "\n", - "extracted_files = list(extract_dir.glob('*'))\n", - "if extracted_files:\n", - " print(f\"Extraction directory '{extract_dir}' already contains {len(extracted_files)} files. 
Skipping extraction.\")\n", - "elif filename.exists():\n", - " with tarfile.open(filename, \"r:gz\") as tar:\n", - " tar.extractall(path=extract_dir)\n", + "def extract_wmo_tar(filename, tar_name, download_dir):\n", + " extract_dir = Path(download_dir) / tar_name.replace('.tar', '')\n", + " extract_dir.mkdir(exist_ok=True)\n", " extracted_files = list(extract_dir.glob('*'))\n", " if extracted_files:\n", - " print(f\"Extraction successful. {len(extracted_files)} files found in {extract_dir}.\")\n", - " # Delete the tar file after extraction\n", - " filename.unlink()\n", - " print(f\"Deleted tar file: {filename}\")\n", + " print(f\"Extraction directory '{extract_dir}' already contains {len(extracted_files)} files. Skipping extraction.\")\n", + " elif filename.exists():\n", + " with tarfile.open(filename, \"r:gz\") as tar:\n", + " tar.extractall(path=extract_dir)\n", + " extracted_files = list(extract_dir.glob('*'))\n", + " if extracted_files:\n", + " print(f\"Extraction successful. {len(extracted_files)} files found in {extract_dir}.\")\n", + " filename.unlink()\n", + " print(f\"Deleted tar file: {filename}\")\n", + " else:\n", + " print(f\"Warning: No files extracted to {extract_dir}. Tar file will not be deleted.\")\n", + " raise RuntimeError(\"Extraction failed, tar file not deleted.\")\n", " else:\n", - " print(f\"Warning: No files extracted to {extract_dir}. Tar file will not be deleted.\")\n", - " raise RuntimeError(\"Extraction failed, tar file not deleted.\")\n", - "else:\n", - " print(f\"No tar file found and extraction directory is empty. Nothing to extract.\")\n", - " raise FileNotFoundError(f\"Missing tar file: {filename}\")\n" + " print(f\"No tar file found and extraction directory is empty. Nothing to extract.\")\n", + " raise FileNotFoundError(f\"Missing tar file: {filename}\")\n", + " return extract_dir" ] }, { "cell_type": "code", "execution_count": null, - "id": "53161550", + "id": "4e43dcc4", "metadata": {}, "outputs": [], "source": [ - "# Create subfolder for netcdf\n", - "netcdf_dir = download_dir / \"netcdf\"\n", - "netcdf_dir.mkdir(parents=True, exist_ok=True)" + "# Move extracted .nc files into netcdf_dir after extraction\n", + "def move_netcdf_files(extract_dir, download_dir):\n", + " netcdf_dir = Path(download_dir) / \"netcdf\"\n", + " netcdf_dir.mkdir(parents=True, exist_ok=True)\n", + " num_files = 0\n", + " for gz_path in extract_dir.glob('*.nc.gz'):\n", + " nc_path = gz_path.with_suffix('') # Remove .gz extension\n", + " with gzip.open(gz_path, 'rb') as f_in, open(nc_path, 'wb') as f_out:\n", + " f_out.write(f_in.read())\n", + " gz_path.unlink()\n", + " shutil.move(str(nc_path), netcdf_dir / nc_path.name)\n", + " num_files += 1\n", + " print(f\"{num_files} .nc files have been extracted, cleaned up, and moved to the netcdf directory: {netcdf_dir}\")\n", + "\n", + " # Delete extraction directory\n", + " try:\n", + " shutil.rmtree(extract_dir)\n", + " print(f\"Deleted extraction directory: {extract_dir}\")\n", + " except Exception as e:\n", + " print(f\"Could not delete extraction directory {extract_dir}: {e}\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "4e43dcc4", + "id": "268fa3b4", "metadata": {}, "outputs": [], "source": [ - "# Move extracted .nc files into netcdf_dir after extraction\n", - "num_files = 0\n", - "for gz_path in extract_dir.glob('*.nc.gz'):\n", - " nc_path = gz_path.with_suffix('') # Remove .gz extension\n", - " with gzip.open(gz_path, 'rb') as f_in, open(nc_path, 'wb') as f_out:\n", - " f_out.write(f_in.read())\n", - " 
gz_path.unlink() # Delete the .gz file after extraction\n", - " shutil.move(str(nc_path), netcdf_dir / nc_path.name)\n", - " num_files += 1\n", - "\n", - "print(f\"{num_files} .nc files have been extracted, cleaned up, and moved to the netcdf directory: {netcdf_dir}\")\n", - "\n", - "# Delete the extraction directory after processing\n", - "try:\n", - " shutil.rmtree(extract_dir)\n", - " print(f\"Deleted extraction directory: {extract_dir}\")\n", - "except Exception as e:\n", - " print(f\"Could not delete extraction directory {extract_dir}: {e}\")" + "wmo_ranges = [\"080000-099999\", \"100000-119999\"] # Add your desired ranges here\n", + "for wmo_id_range in wmo_ranges:\n", + " filename, tar_name = download_wmo_range(wmo_id_range, download_dir)\n", + " extract_dir = extract_wmo_tar(filename, tar_name, download_dir)\n", + " move_netcdf_files(extract_dir, download_dir)" ] } ], From d50c0d2637803a4e29e87e8d07b49e6d27463fa7 Mon Sep 17 00:00:00 2001 From: Joel Miller Date: Tue, 22 Jul 2025 17:05:06 +0100 Subject: [PATCH 04/20] Make downloading and extraction even more idempotent. Final cell of downloads can be run as many times as we like --- .../tutorial/HadISD/1_HadISD_Download.ipynb | 94 ++++++++++++++++--- .../tutorial/HadISD/2_HadISD_to_zarr.ipynb | 8 +- notebooks/tutorial/HadISD/Data_Config.ipynb | 57 ++++++----- 3 files changed, 123 insertions(+), 36 deletions(-) diff --git a/notebooks/tutorial/HadISD/1_HadISD_Download.ipynb b/notebooks/tutorial/HadISD/1_HadISD_Download.ipynb index 3245e0d8..802a0ee1 100644 --- a/notebooks/tutorial/HadISD/1_HadISD_Download.ipynb +++ b/notebooks/tutorial/HadISD/1_HadISD_Download.ipynb @@ -84,7 +84,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(f\"Downloading HadISD data for WMO range: {wmo_id_range}\")" + "print(f\"Downloading HadISD data for WMO range: {wmo_id_ranges}\")" ] }, { @@ -94,7 +94,7 @@ "metadata": {}, "outputs": [], "source": [ - "wmo_id_range = wmo_id_range # This has been defined in HadISD_data_config.ipynb" + "wmo_id_ranges = wmo_id_ranges # This has been defined in HadISD_data_config.ipynb" ] }, { @@ -147,9 +147,7 @@ " else:\n", " print(f\"Warning: Download incomplete. 
Local size: {final_size}, Remote size: {remote_size}\")\n", "\n", - " return filename, tar_name\n", - "\n", - "# Possibly also add check to see if netcdf files esist for the downloaded tar file, if so then don't download again" + " return filename, tar_name\n" ] }, { @@ -219,15 +217,89 @@ " print(f\"Could not delete extraction directory {extract_dir}: {e}\")" ] }, + { + "cell_type": "markdown", + "id": "932e8906", + "metadata": {}, + "source": [ + "### Idempotent Checks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dcb9b902", + "metadata": {}, + "outputs": [], + "source": [ + "def netcdf_files_exist_for_range(wmo_id_range, netcdf_dir):\n", + " \"\"\"Check if any .nc files for the given WMO range exist in the netcdf directory.\"\"\"\n", + " start, end = map(int, wmo_id_range.split('-'))\n", + " nc_files = list(Path(netcdf_dir).glob(\"*.nc\"))\n", + " for nc_file in nc_files:\n", + " try:\n", + " # Extract the first 6 digits from the station part of the filename\n", + " station_part = nc_file.stem.split('_')[-1]\n", + " wmo_number = int(station_part.split('-')[0])\n", + " if start <= wmo_number <= end:\n", + " return True\n", + " except Exception as e:\n", + " print(f\"Skipping file {nc_file.name}: {e}\")\n", + " continue\n", + " print(f\"No NetCDF files found for WMO range {wmo_id_range}.\")\n", + " return False" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "268fa3b4", + "id": "36ad9920", "metadata": {}, "outputs": [], "source": [ - "wmo_ranges = [\"080000-099999\", \"100000-119999\"] # Add your desired ranges here\n", - "for wmo_id_range in wmo_ranges:\n", + "def is_tar_fully_downloaded(wmo_id_range, download_dir):\n", + " \"\"\"Check if the tar file exists and is fully downloaded (size matches remote).\"\"\"\n", + " wmo_str = f\"WMO_{wmo_id_range}\"\n", + " tar_name = f\"{wmo_str}.tar\"\n", + " tar_path = Path(download_dir) / tar_name\n", + " url = f\"https://www.metoffice.gov.uk/hadobs/hadisd/v340_2023f/data/{wmo_str}.tar.gz\"\n", + "\n", + " if not tar_path.exists():\n", + " return False\n", + "\n", + " # Get remote file size\n", + " head = requests.head(url, allow_redirects=True)\n", + " remote_size = int(head.headers.get('content-length', 0))\n", + " local_size = tar_path.stat().st_size\n", + "\n", + " return local_size == remote_size" + ] + }, + { + "cell_type": "markdown", + "id": "77f044b9", + "metadata": {}, + "source": [ + "### Loop through each WMO range, download if necessary, extract, and move files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffcc5730", + "metadata": {}, + "outputs": [], + "source": [ + "netcdf_dir = Path(download_dir) / \"netcdf\"\n", + "for wmo_id_range in wmo_id_ranges:\n", + " if is_tar_fully_downloaded(wmo_id_range, download_dir):\n", + " print(f\"Tar file for {wmo_id_range} is fully downloaded. Skipping download.\")\n", + " continue\n", + "\n", + " if netcdf_files_exist_for_range(wmo_id_range, netcdf_dir):\n", + " print(f\"NetCDF files for {wmo_id_range} already exist. 
Skipping download and extraction.\")\n", + " continue\n", + "\n", " filename, tar_name = download_wmo_range(wmo_id_range, download_dir)\n", " extract_dir = extract_wmo_tar(filename, tar_name, download_dir)\n", " move_netcdf_files(extract_dir, download_dir)" @@ -236,9 +308,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "pyearthtools", "language": "python", - "name": "python3" + "name": "pyearthtools" }, "language_info": { "codemirror_mode": { @@ -250,7 +322,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.2" + "version": "3.13.5" } }, "nbformat": 4, diff --git a/notebooks/tutorial/HadISD/2_HadISD_to_zarr.ipynb b/notebooks/tutorial/HadISD/2_HadISD_to_zarr.ipynb index 03e92049..92b64097 100644 --- a/notebooks/tutorial/HadISD/2_HadISD_to_zarr.ipynb +++ b/notebooks/tutorial/HadISD/2_HadISD_to_zarr.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "55a84e6e", "metadata": {}, "outputs": [], @@ -298,9 +298,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "pyearthtools", "language": "python", - "name": "python3" + "name": "pyearthtools" }, "language_info": { "codemirror_mode": { @@ -312,7 +312,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.2" + "version": "3.13.5" } }, "nbformat": 4, diff --git a/notebooks/tutorial/HadISD/Data_Config.ipynb b/notebooks/tutorial/HadISD/Data_Config.ipynb index db28f31b..05af1865 100644 --- a/notebooks/tutorial/HadISD/Data_Config.ipynb +++ b/notebooks/tutorial/HadISD/Data_Config.ipynb @@ -57,27 +57,42 @@ "metadata": {}, "outputs": [], "source": [ - "# A sample list of WMO number ranges. Users can find more at the official HadISD download page.\n", + "# For any station ranges you don't want to download, you can comment them out here\n", "wmo_id_ranges = [\n", - " \"000000-029999\",\n", - " \"080000-099999\",\n", - " \"200000-249999\",\n", - " \"720000-721999\",\n", + " #\"000000-029999\",\n", + " #\"030000-049999\",\n", + " #\"050000-079999\",\n", + " \"080000-099999\",\n", + " # \"100000-149999\",\n", + " # \"150000-199999\",\n", + " # \"200000-249999\",\n", + " # \"250000-299999\",\n", + " # \"300000-349999\",\n", + " # \"350000-399999\",\n", + " # \"400000-449999\",\n", + " # \"450000-499999\",\n", + " # \"500000-549999\",\n", + " # \"550000-599999\",\n", + " # \"600000-649999\",\n", + " # \"650000-699999\",\n", + " # \"700000-709999\",\n", + " # \"710000-714999\",\n", + " # \"715000-719999\",\n", + " \"720000-721999\",\n", + " # \"722000-722999\",\n", + " # \"723000-723999\",\n", + " # \"724000-724999\",\n", + " # \"725000-725999\",\n", + " # \"726000-726999\",\n", + " # \"727000-729999\",\n", + " # \"730000-799999\",\n", + " \"800000-849999\",\n", + " # \"850000-899999\",\n", + " # \"900000-949999\",\n", + " # \"950000-999999\",\n", "]" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "35617ad2", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "# User sets the WMO number range to download\n", - "wmo_id_range = \"080000-099999\" # Change this to the desired WMO range, either from the sample list or from the HadISD page." 
- ] - }, { "cell_type": "markdown", "id": "7aec321a", @@ -97,15 +112,15 @@ "# Set the date range to reindex the time coordinate\n", "DATE_RANGE = (\"1970-01-01T00\", \"2023-12-31T23\")\n", "# Set the input directory to the folder with raw NetCDFs\n", - "input_dir = download_dir / f\"WMO_{wmo_id_range}\" / \"netcdf\"\n", + "input_dir = download_dir / \"netcdf\"\n", "# Set the Zarr output directory to a sibling folder under the same WMO directory\n", - "zarr_output_dir = download_dir / f\"WMO_{wmo_id_range}\" / \"zarr\"" + "zarr_output_dir = download_dir / \"zarr\"" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "pet_tutorial", "language": "python", "name": "python3" }, @@ -119,7 +134,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.2" + "version": "3.11.11" } }, "nbformat": 4, From 44b819c41eb7e27884e7bc214e5c409adeadc726 Mon Sep 17 00:00:00 2001 From: Joel Miller Date: Wed, 23 Jul 2025 14:48:52 +0100 Subject: [PATCH 05/20] Improve conversion efficency --- .../tutorial/HadISD/2_HadISD_to_zarr.ipynb | 87 +++++++++---------- 1 file changed, 41 insertions(+), 46 deletions(-) diff --git a/notebooks/tutorial/HadISD/2_HadISD_to_zarr.ipynb b/notebooks/tutorial/HadISD/2_HadISD_to_zarr.ipynb index 92b64097..f02b3b2a 100644 --- a/notebooks/tutorial/HadISD/2_HadISD_to_zarr.ipynb +++ b/notebooks/tutorial/HadISD/2_HadISD_to_zarr.ipynb @@ -10,7 +10,10 @@ "import xarray as xr\n", "import pandas as pd\n", "from pathlib import Path\n", - "from dask import delayed, compute" + "from dask import delayed, compute\n", + "import multiprocessing\n", + "import os\n", + "import warnings\n" ] }, { @@ -114,8 +117,8 @@ "metadata": {}, "outputs": [], "source": [ - "def process_all_to_zarr(netcdf_dir, zarr_output_dir, date_range):\n", - " \"\"\"Convert NetCDF files to Zarr only if not already present in the Zarr directory.\"\"\"\n", + "def process_all_to_zarr(netcdf_dir, zarr_output_dir, date_range, scheduler='threads', num_workers=None):\n", + " \"\"\"Parallel NetCDF to Zarr conversion using dask.delayed. Returns both success boolean and status message.\"\"\"\n", " netcdf_dir = Path(netcdf_dir)\n", " zarr_output_dir = Path(zarr_output_dir)\n", " zarr_output_dir.mkdir(parents=True, exist_ok=True)\n", @@ -123,61 +126,38 @@ " netcdf_files = list(netcdf_dir.glob(\"*.nc\"))\n", " zarr_files = set(f.stem for f in zarr_output_dir.glob(\"*.zarr\"))\n", "\n", - " converted = 0\n", + " tasks = []\n", " skipped = 0\n", "\n", - " for nc_file in netcdf_files:\n", - " zarr_name = nc_file.stem\n", - " out_path = zarr_output_dir / f\"{zarr_name}.zarr\"\n", - " if zarr_name in zarr_files:\n", - " print(f\"Zarr file already exists for {nc_file.name}: {out_path.name}. Skipping.\")\n", - " skipped += 1\n", - " continue\n", - " print(f\"Converting: {nc_file.name} → {out_path.name}\")\n", + " def convert_netcdf_to_zarr(nc_file, out_path, date_range):\n", " try:\n", " ds = preprocess_station(nc_file, date_range)\n", " ds.to_zarr(str(out_path), mode='w')\n", - " converted += 1\n", + " return (True, f\"Converted: {nc_file.name} → {out_path.name}\")\n", " except Exception as e:\n", - " print(f\"Failed on {nc_file.name}: {e}\")\n", - "\n", - " print(f\"Conversion complete. 
{converted} new stations converted, {skipped} already present.\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6c2c9987", - "metadata": {}, - "outputs": [], - "source": [ - "def process_all_to_zarr(netcdf_dir, zarr_output_dir, date_range):\n", - " \"\"\"Convert NetCDF files to Zarr only if not already present in the Zarr directory.\"\"\"\n", - " netcdf_dir = Path(netcdf_dir)\n", - " zarr_output_dir = Path(zarr_output_dir)\n", - " zarr_output_dir.mkdir(parents=True, exist_ok=True)\n", - "\n", - " netcdf_files = list(netcdf_dir.glob(\"*.nc\"))\n", - " zarr_files = set(f.stem for f in zarr_output_dir.glob(\"*.zarr\"))\n", - "\n", - " converted = 0\n", - " skipped = 0\n", + " return (False, f\"Failed on {nc_file.name}: {e}\")\n", "\n", " for nc_file in netcdf_files:\n", " zarr_name = nc_file.stem\n", " out_path = zarr_output_dir / f\"{zarr_name}.zarr\"\n", " if zarr_name in zarr_files:\n", - " print(f\"Zarr file already exists for {nc_file.name}: {out_path.name}. Skipping.\")\n", + " # print(f\"Zarr file already exists for {nc_file.name}: {out_path.name}. Skipping.\")\n", " skipped += 1\n", " continue\n", - " print(f\"Converting: {nc_file.name} → {out_path.name}\")\n", - " try:\n", - " ds = preprocess_station(nc_file, date_range)\n", - " ds.to_zarr(str(out_path), mode='w')\n", - " converted += 1\n", - " except Exception as e:\n", - " print(f\"Failed on {nc_file.name}: {e}\")\n", + " tasks.append(delayed(convert_netcdf_to_zarr)(nc_file, out_path, date_range))\n", + "\n", + " if not tasks:\n", + " print(f\"No new NetCDF files to convert. {skipped} already present.\")\n", + " return\n", "\n", + " if num_workers is None:\n", + " num_workers = multiprocessing.cpu_count() // 2\n", + "\n", + " print(f\"Starting Dask parallel conversion with {num_workers} workers...\")\n", + " results = compute(*tasks, scheduler=scheduler, num_workers=num_workers)\n", + " for success, msg in results:\n", + " print(msg)\n", + " converted = sum(success for success, _ in results)\n", " print(f\"Conversion complete. 
{converted} new stations converted, {skipped} already present.\")" ] }, @@ -254,11 +234,26 @@ { "cell_type": "code", "execution_count": null, - "id": "57101a08", + "id": "f5b48e73", + "metadata": {}, + "outputs": [], + "source": [ + "# Supress conversion to zarr warnings\n", + "os.environ[\"PYTHONWARNINGS\"] = \"ignore::UserWarning\"\n", + "warnings.filterwarnings(\"ignore\", message=\".*not part in the Zarr format 3 specification.*\")\n", + "warnings.filterwarnings(\"ignore\", message=\".*vlen-utf8.*\")\n", + "warnings.filterwarnings(\"ignore\", message=\".*dtype Date: Mon, 28 Jul 2025 22:52:43 +0100 Subject: [PATCH 06/20] Tidy station list --- notebooks/tutorial/HadISD/Data_Config.ipynb | 56 ++++++++++----------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/notebooks/tutorial/HadISD/Data_Config.ipynb b/notebooks/tutorial/HadISD/Data_Config.ipynb index 05af1865..ad00c678 100644 --- a/notebooks/tutorial/HadISD/Data_Config.ipynb +++ b/notebooks/tutorial/HadISD/Data_Config.ipynb @@ -62,34 +62,34 @@ " #\"000000-029999\",\n", " #\"030000-049999\",\n", " #\"050000-079999\",\n", - " \"080000-099999\",\n", - " # \"100000-149999\",\n", - " # \"150000-199999\",\n", - " # \"200000-249999\",\n", - " # \"250000-299999\",\n", - " # \"300000-349999\",\n", - " # \"350000-399999\",\n", - " # \"400000-449999\",\n", - " # \"450000-499999\",\n", - " # \"500000-549999\",\n", - " # \"550000-599999\",\n", - " # \"600000-649999\",\n", - " # \"650000-699999\",\n", - " # \"700000-709999\",\n", - " # \"710000-714999\",\n", - " # \"715000-719999\",\n", - " \"720000-721999\",\n", - " # \"722000-722999\",\n", - " # \"723000-723999\",\n", - " # \"724000-724999\",\n", - " # \"725000-725999\",\n", - " # \"726000-726999\",\n", - " # \"727000-729999\",\n", - " # \"730000-799999\",\n", - " \"800000-849999\",\n", - " # \"850000-899999\",\n", - " # \"900000-949999\",\n", - " # \"950000-999999\",\n", + " #\"080000-099999\",\n", + " #\"100000-149999\",\n", + " #\"150000-199999\",\n", + " #\"200000-249999\",\n", + " #\"250000-299999\",\n", + " #\"300000-349999\",\n", + " #\"350000-399999\",\n", + " #\"400000-449999\",\n", + " #\"450000-499999\",\n", + " \"500000-549999\",\n", + " #\"550000-599999\",\n", + " #\"600000-649999\",\n", + " #\"650000-699999\",\n", + " #\"700000-709999\",\n", + " #\"710000-714999\",\n", + " #\"715000-719999\",\n", + " #\"720000-721999\",\n", + " \"722000-722999\",\n", + " #\"723000-723999\",\n", + " #\"724000-724999\",\n", + " #\"725000-725999\",\n", + " #\"726000-726999\",\n", + " #\"727000-729999\",\n", + " #\"730000-799999\",\n", + " \"800000-849999\",\n", + " #\"850000-899999\",\n", + " #\"900000-949999\",\n", + " #\"950000-999999\",\n", "]" ] }, From d894f7f6d6990418196f7eb2874f315749613adc Mon Sep 17 00:00:00 2001 From: Joel Miller Date: Mon, 28 Jul 2025 23:00:29 +0100 Subject: [PATCH 07/20] Improve notebook descriptions --- .../tutorial/HadISD/1_HadISD_Download.ipynb | 20 ++++++------------- .../tutorial/HadISD/2_HadISD_to_zarr.ipynb | 2 +- .../HadISD/3_HadISD_XGBoost_Pipeline.ipynb | 4 ++-- 3 files changed, 9 insertions(+), 17 deletions(-) diff --git a/notebooks/tutorial/HadISD/1_HadISD_Download.ipynb b/notebooks/tutorial/HadISD/1_HadISD_Download.ipynb index 802a0ee1..53d3450c 100644 --- a/notebooks/tutorial/HadISD/1_HadISD_Download.ipynb +++ b/notebooks/tutorial/HadISD/1_HadISD_Download.ipynb @@ -61,20 +61,22 @@ "metadata": {}, "source": [ "### Download HadISD Data\n", - "The following code will download the HadISD data files. 
Some files take longer to download than others depending on time of day. To download different WMO datasets, you can change `wmo_id_range` in the `Data_Config.ipynb` notebook .\n", + "The following code will download the HadISD data files. Some files take longer to download than others depending on time of day. To download different WMO datasets, you can change `wmo_id_ranges` in the `Data_Config.ipynb` notebook.\n", "\n", "The full list of available data can be found here:\n", - "https://www.metoffice.gov.uk/hadobs/hadisd/v340_2023f/download.html" + "https://www.metoffice.gov.uk/hadobs/hadisd/v340_2023f/download.html\n", + "\n", + "Station data has been split up into ranges to make downloads more managable. You may download as much or as little as you like. To get started we reccomend just downloading a few station ranges to get an idea of how to use HadISD data with PyEarthTools. " ] }, { "cell_type": "code", "execution_count": null, - "id": "feb8d671", + "id": "8ddbebda", "metadata": {}, "outputs": [], "source": [ - "# Explain why stations are split into ranges, file size, and how it's not neccesssary to download all stations. " + "wmo_id_ranges = wmo_id_ranges # This has been defined in HadISD_data_config.ipynb" ] }, { @@ -87,16 +89,6 @@ "print(f\"Downloading HadISD data for WMO range: {wmo_id_ranges}\")" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "8ddbebda", - "metadata": {}, - "outputs": [], - "source": [ - "wmo_id_ranges = wmo_id_ranges # This has been defined in HadISD_data_config.ipynb" - ] - }, { "cell_type": "code", "execution_count": null, diff --git a/notebooks/tutorial/HadISD/2_HadISD_to_zarr.ipynb b/notebooks/tutorial/HadISD/2_HadISD_to_zarr.ipynb index f02b3b2a..6c9a2099 100644 --- a/notebooks/tutorial/HadISD/2_HadISD_to_zarr.ipynb +++ b/notebooks/tutorial/HadISD/2_HadISD_to_zarr.ipynb @@ -253,7 +253,7 @@ "outputs": [], "source": [ "# Run parallel NetCDF-to-Zarr conversion with safe defaults\n", - "process_all_to_zarr(str(input_dir), str(zarr_output_dir), DATE_RANGE, scheduler='processes', num_workers=3)" + "process_all_to_zarr(str(input_dir), str(zarr_output_dir), DATE_RANGE, scheduler='processes')" ] }, { diff --git a/notebooks/tutorial/HadISD/3_HadISD_XGBoost_Pipeline.ipynb b/notebooks/tutorial/HadISD/3_HadISD_XGBoost_Pipeline.ipynb index 8d2cf388..12aa3c4e 100644 --- a/notebooks/tutorial/HadISD/3_HadISD_XGBoost_Pipeline.ipynb +++ b/notebooks/tutorial/HadISD/3_HadISD_XGBoost_Pipeline.ipynb @@ -241,7 +241,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "pet_tutorial", "language": "python", "name": "python3" }, @@ -255,7 +255,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.2" + "version": "3.11.11" } }, "nbformat": 4, From 026a6d8d67db47ca327111b04de3370487deae99 Mon Sep 17 00:00:00 2001 From: Joel Miller Date: Mon, 11 Aug 2025 14:23:34 +0100 Subject: [PATCH 08/20] Ruff compliance added --- .../tutorial/HadISD/1_HadISD_Download.ipynb | 5 +-- .../tutorial/HadISD/2_HadISD_to_zarr.ipynb | 3 +- .../HadISD/3_HadISD_XGBoost_Pipeline.ipynb | 27 ++++++++------ .../HadISD/HadISD_QC_Exploration.ipynb | 36 +++++-------------- .../tutorial/HadISD/Pipeline_Config.ipynb | 6 ++-- 5 files changed, 31 insertions(+), 46 deletions(-) diff --git a/notebooks/tutorial/HadISD/1_HadISD_Download.ipynb b/notebooks/tutorial/HadISD/1_HadISD_Download.ipynb index 53d3450c..fceb40eb 100644 --- a/notebooks/tutorial/HadISD/1_HadISD_Download.ipynb +++ 
b/notebooks/tutorial/HadISD/1_HadISD_Download.ipynb @@ -51,6 +51,7 @@ "metadata": {}, "outputs": [], "source": [ + "# ruff: noqa: F821\n", "%run Data_Config.ipynb\n", "print(f\"Data will be downloaded to: {download_dir}\") " ] @@ -175,7 +176,7 @@ " print(f\"Warning: No files extracted to {extract_dir}. Tar file will not be deleted.\")\n", " raise RuntimeError(\"Extraction failed, tar file not deleted.\")\n", " else:\n", - " print(f\"No tar file found and extraction directory is empty. Nothing to extract.\")\n", + " print(\"No tar file found and extraction directory is empty. Nothing to extract.\")\n", " raise FileNotFoundError(f\"Missing tar file: {filename}\")\n", " return extract_dir" ] @@ -302,7 +303,7 @@ "kernelspec": { "display_name": "pyearthtools", "language": "python", - "name": "pyearthtools" + "name": "python3" }, "language_info": { "codemirror_mode": { diff --git a/notebooks/tutorial/HadISD/2_HadISD_to_zarr.ipynb b/notebooks/tutorial/HadISD/2_HadISD_to_zarr.ipynb index 6c9a2099..42c05837 100644 --- a/notebooks/tutorial/HadISD/2_HadISD_to_zarr.ipynb +++ b/notebooks/tutorial/HadISD/2_HadISD_to_zarr.ipynb @@ -13,7 +13,7 @@ "from dask import delayed, compute\n", "import multiprocessing\n", "import os\n", - "import warnings\n" + "import warnings" ] }, { @@ -225,6 +225,7 @@ "metadata": {}, "outputs": [], "source": [ + "# ruff: noqa: F821\n", "%run Data_Config.ipynb\n", "print(f\"NetCDF input directory: {input_dir}\")\n", "print(f\"Zarr output directory: {zarr_output_dir}\")\n", diff --git a/notebooks/tutorial/HadISD/3_HadISD_XGBoost_Pipeline.ipynb b/notebooks/tutorial/HadISD/3_HadISD_XGBoost_Pipeline.ipynb index 12aa3c4e..ed47906a 100644 --- a/notebooks/tutorial/HadISD/3_HadISD_XGBoost_Pipeline.ipynb +++ b/notebooks/tutorial/HadISD/3_HadISD_XGBoost_Pipeline.ipynb @@ -14,13 +14,15 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", - "import pandas as pd\n", - "from pathlib import Path\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from xgboost import XGBClassifier\n", + "from sklearn.metrics import classification_report, confusion_matrix\n", "\n", "import pyearthtools.pipeline as petpipe\n", "import pyearthtools.data as petdata\n", @@ -40,6 +42,7 @@ "metadata": {}, "outputs": [], "source": [ + "# ruff: noqa: F821\n", "%run Pipeline_Config.ipynb" ] }, @@ -71,7 +74,7 @@ "source": [ "# Select first n stations\n", "first_ten_stations = all_stations_ordered[:10]\n", - "print(f\"List of first ten stations:\", first_ten_stations)" + "print(\"List of first ten stations:\", first_ten_stations)" ] }, { @@ -182,8 +185,6 @@ "metadata": {}, "outputs": [], "source": [ - "from xgboost import XGBClassifier\n", - "\n", "# Calculate scale_pos_weight for class imbalance\n", "scale_pos_weight = (len(y_train) - np.sum(y_train)) / np.sum(y_train)\n", "#scale_pos_weight = num_zeros / num_ones \n", @@ -218,13 +219,17 @@ "outputs": [], "source": [ "# compare predictions with true labels\n", - "from sklearn.metrics import classification_report, confusion_matrix\n", "print(classification_report(y_test, y_pred))\n", - "print(confusion_matrix(y_test, y_pred)) \n", - "\n", + "print(confusion_matrix(y_test, y_pred)) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "# Plot confusion matrix\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", "def plot_confusion_matrix(cm, classes, title='Confusion Matrix', 
cmap=plt.cm.Blues):\n", " plt.figure(figsize=(8, 6))\n", " sns.heatmap(cm, annot=True, fmt='d', cmap=cmap,\n", diff --git a/notebooks/tutorial/HadISD/HadISD_QC_Exploration.ipynb b/notebooks/tutorial/HadISD/HadISD_QC_Exploration.ipynb index c26523fa..0b3694cf 100644 --- a/notebooks/tutorial/HadISD/HadISD_QC_Exploration.ipynb +++ b/notebooks/tutorial/HadISD/HadISD_QC_Exploration.ipynb @@ -15,10 +15,7 @@ "metadata": {}, "outputs": [], "source": [ - "import datetime\n", "import numpy as np\n", - "import pandas as pd\n", - "from pathlib import Path\n", "\n", "import pyearthtools.pipeline as petpipe\n", "import pyearthtools.data as petdata\n", @@ -27,12 +24,13 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "id": "4a7da841", "metadata": {}, "outputs": [], "source": [ - "# %run HadISD_config.ipynb" + "# ruff: noqa: F821\n", + "%run Pipeline_Config.ipynb" ] }, { @@ -89,17 +87,7 @@ "metadata": {}, "outputs": [], "source": [ - "y = data_prep_pipe[\"1969-01-01T07\"]\n", - "y" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3d3819b9", - "metadata": {}, - "outputs": [], - "source": [ + "x = data_prep_pipe[\"1969-01-01T07\"]\n", "x" ] }, @@ -110,7 +98,7 @@ "metadata": {}, "outputs": [], "source": [ - "qc = y[\"quality_control_flags\"].values\n" + "qc = x[\"quality_control_flags\"].values\n" ] }, { @@ -214,21 +202,13 @@ "# for qc, test 12, time 826, station 0, print the value of the test\n", "print(\"QC value for test 12, time 826, station 0:\", qc[0, 826, 12])" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8c8f1bc9", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "pyearthtools", "language": "python", - "name": "python3" + "name": "pyearthtools" }, "language_info": { "codemirror_mode": { @@ -240,7 +220,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.2" + "version": "3.13.5" } }, "nbformat": 4, diff --git a/notebooks/tutorial/HadISD/Pipeline_Config.ipynb b/notebooks/tutorial/HadISD/Pipeline_Config.ipynb index e0dcc486..71b05a54 100644 --- a/notebooks/tutorial/HadISD/Pipeline_Config.ipynb +++ b/notebooks/tutorial/HadISD/Pipeline_Config.ipynb @@ -2,12 +2,12 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "1abab3c3", "metadata": {}, "outputs": [], "source": [ - "# import pyearthtools.pipeline as petpipe" + "import pyearthtools.pipeline as petpipe" ] }, { @@ -314,8 +314,6 @@ "metadata": {}, "outputs": [], "source": [ - "import xarray as xr\n", - "import numpy as np\n", "from pyearthtools.data.transforms.values import AddFlaggedObs\n", "\n", "def test_add_flagged_obs():\n", From 05e032121c2ec2153786758faf83c1c368350adb Mon Sep 17 00:00:00 2001 From: Joel Miller Date: Fri, 29 Aug 2025 17:25:57 +0100 Subject: [PATCH 09/20] Update Drop transform for more robust handling --- .../data/src/pyearthtools/data/transforms/variables.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/packages/data/src/pyearthtools/data/transforms/variables.py b/packages/data/src/pyearthtools/data/transforms/variables.py index 93d0117e..22ba38a1 100644 --- a/packages/data/src/pyearthtools/data/transforms/variables.py +++ b/packages/data/src/pyearthtools/data/transforms/variables.py @@ -86,12 +86,7 @@ def __init__(self, variables: list[str] | str, *extra_variables): def apply(self, dataset: xr.Dataset) -> xr.Dataset: if self._variables is 
None: return dataset - - var_included = set(dataset.data_vars).difference(set(self._variables)) - - if not var_included: - return dataset - return dataset[var_included] + return dataset.drop_vars(self._variables) class Select(Transform): From f1ae57f7904e416ee2bfe4226e85eb1aae268360 Mon Sep 17 00:00:00 2001 From: Joel Miller Date: Fri, 29 Aug 2025 17:26:48 +0100 Subject: [PATCH 10/20] Update download notebook to show good output --- .../tutorial/HadISD/1_HadISD_Download.ipynb | 87 ++++++++++++++++--- 1 file changed, 73 insertions(+), 14 deletions(-) diff --git a/notebooks/tutorial/HadISD/1_HadISD_Download.ipynb b/notebooks/tutorial/HadISD/1_HadISD_Download.ipynb index fceb40eb..71a8b0b3 100644 --- a/notebooks/tutorial/HadISD/1_HadISD_Download.ipynb +++ b/notebooks/tutorial/HadISD/1_HadISD_Download.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "e4fb7b1d", "metadata": {}, "outputs": [], @@ -46,14 +46,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "2eadaf27", "metadata": {}, "outputs": [], "source": [ "# ruff: noqa: F821\n", - "%run Data_Config.ipynb\n", - "print(f\"Data will be downloaded to: {download_dir}\") " + "%run Data_Config.ipynb" ] }, { @@ -72,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "8ddbebda", "metadata": {}, "outputs": [], @@ -82,17 +81,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "11a188d4", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading HadISD data for WMO range: ['000000-029999', '500000-549999', '722000-722999', '800000-849999', '950000-999999']\n" + ] + } + ], "source": [ "print(f\"Downloading HadISD data for WMO range: {wmo_id_ranges}\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "08ac36fd", "metadata": {}, "outputs": [], @@ -153,7 +160,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "fb79a81c", "metadata": {}, "outputs": [], @@ -183,7 +190,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "4e43dcc4", "metadata": {}, "outputs": [], @@ -220,7 +227,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "dcb9b902", "metadata": {}, "outputs": [], @@ -245,7 +252,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "36ad9920", "metadata": {}, "outputs": [], @@ -278,10 +285,62 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "ffcc5730", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No NetCDF files found for WMO range 000000-029999.\n", + "Starting download for WMO_000000-029999.tar...\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "adba6f0822ac409db1863916821d8c62", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading WMO_000000-029999.tar: 0%| | 0.00/1.04G [00:00 Date: Fri, 29 Aug 2025 17:27:19 +0100 Subject: [PATCH 11/20] Update conversion to zarr notebook to show good output --- .../tutorial/HadISD/2_HadISD_to_zarr.ipynb | 3050 ++++++++++++++++- 1 file changed, 3016 insertions(+), 34 deletions(-) diff --git a/notebooks/tutorial/HadISD/2_HadISD_to_zarr.ipynb b/notebooks/tutorial/HadISD/2_HadISD_to_zarr.ipynb index 42c05837..4f052f68 100644 --- 
a/notebooks/tutorial/HadISD/2_HadISD_to_zarr.ipynb +++ b/notebooks/tutorial/HadISD/2_HadISD_to_zarr.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "55a84e6e", "metadata": {}, "outputs": [], @@ -11,9 +11,7 @@ "import pandas as pd\n", "from pathlib import Path\n", "from dask import delayed, compute\n", - "import multiprocessing\n", - "import os\n", - "import warnings" + "import multiprocessing" ] }, { @@ -52,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "209b3b80", "metadata": {}, "outputs": [], @@ -69,7 +67,7 @@ "\n", " # Assign station ID from attributes or filename\n", " station_id = ds.attrs.get(\"station_id\", file_path.stem)\n", - " ds = ds.assign_coords(station_id=station_id)\n", + " ds = ds.assign_coords(station=station_id)\n", "\n", " # Promote lat/lon/elevation to coordinates (if not already)\n", " for coord in [\"latitude\", \"longitude\", \"elevation\"]:\n", @@ -112,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "e8bcbffa", "metadata": {}, "outputs": [], @@ -178,7 +176,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "0c6ca548", "metadata": {}, "outputs": [], @@ -220,40 +218,50 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "09e40126", "metadata": {}, "outputs": [], "source": [ "# ruff: noqa: F821\n", - "%run Data_Config.ipynb\n", - "print(f\"NetCDF input directory: {input_dir}\")\n", - "print(f\"Zarr output directory: {zarr_output_dir}\")\n", - "print(f\"Date range: {DATE_RANGE}\")" + "%run Data_Config.ipynb" ] }, { "cell_type": "code", - "execution_count": null, - "id": "f5b48e73", - "metadata": {}, - "outputs": [], - "source": [ - "# Supress conversion to zarr warnings\n", - "os.environ[\"PYTHONWARNINGS\"] = \"ignore::UserWarning\"\n", - "warnings.filterwarnings(\"ignore\", message=\".*not part in the Zarr format 3 specification.*\")\n", - "warnings.filterwarnings(\"ignore\", message=\".*vlen-utf8.*\")\n", - "warnings.filterwarnings(\"ignore\", message=\".*dtype \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
" + ], + "text/plain": [ + " Size: 8GB\n", + "Dimensions: (station: 18, time: 473352, flagged: 19, test: 71,\n", + " reporting_v: 19, reporting_t: 1116, reporting_2: 2,\n", + " coordinate_length: 1)\n", + "Coordinates:\n", + " elevation (station, coordinate_length) float64 144B dask.array\n", + " latitude (station, coordinate_length) float64 144B dask.array\n", + " longitude (station, coordinate_length) float64 144B dask.array\n", + " * station (station) \n", + " dewpoints (station, time) float64 68MB dask.array\n", + " flagged_obs (station, time, flagged) float64 1GB dask.array\n", + " high_cloud_cover (station, time) float64 68MB dask.array\n", + " low_cloud_cover (station, time) float64 68MB dask.array\n", + " mid_cloud_cover (station, time) float64 68MB dask.array\n", + " ... ...\n", + " stnlp (station, time) float64 68MB dask.array\n", + " temperatures (station, time) float64 68MB dask.array\n", + " total_cloud_cover (station, time) float64 68MB dask.array\n", + " wind_gust (station, time) float64 68MB dask.array\n", + " winddirs (station, time) float64 68MB dask.array\n", + " windspeeds (station, time) float64 68MB dask.array\n", + "Attributes: (12/39)\n", + " Conventions: CF-1.6\n", + " Metadata_Conventions: Unidata Dataset Discovery v1.0, CF Discrete ...\n", + " acknowledgement: RJHD was supported by the Joint BEIS/Defra M...\n", + " cdm_data_type: station\n", + " creator_email: robert.dunn@metoffice.gov.uk\n", + " creator_name: Robert Dunn\n", + " ... ...\n", + " station_id: 507270-99999\n", + " station_information: Where station is a composite the station id ...\n", + " summary: Quality-controlled, sub-daily, station datas...\n", + " time_coverage_end: 2023-12-31T21:00Z\n", + " time_coverage_start: 1956-08-20T06:00Z\n", + " title: HadISD" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ds_combined = load_combined_dataset(zarr_output_dir)\n", "ds_combined" @@ -288,7 +3270,7 @@ "- `HadISD_data/WMO_080000-099999/netcdf/` (raw NetCDF files)\n", "- `HadISD_data/WMO_080000-099999/zarr/` (processed Zarr stores with harmonized time coordinates)\n", "\n", - "This makes it obvious which data is raw and which is ready for fast, parallel analysis." + "This makes it obvious which data is raw and which is ready for fast, parallel analysis. 
" ] } ], @@ -296,7 +3278,7 @@ "kernelspec": { "display_name": "pyearthtools", "language": "python", - "name": "pyearthtools" + "name": "python3" }, "language_info": { "codemirror_mode": { From d0eb728b3c82145a14a320e6af7e2c0fa498f4c1 Mon Sep 17 00:00:00 2001 From: Joel Miller Date: Fri, 29 Aug 2025 17:27:59 +0100 Subject: [PATCH 12/20] Add docstrings --- notebooks/tutorial/HadISD/Pipeline_Config.ipynb | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/notebooks/tutorial/HadISD/Pipeline_Config.ipynb b/notebooks/tutorial/HadISD/Pipeline_Config.ipynb index 71b05a54..4b5298b3 100644 --- a/notebooks/tutorial/HadISD/Pipeline_Config.ipynb +++ b/notebooks/tutorial/HadISD/Pipeline_Config.ipynb @@ -15,7 +15,7 @@ "id": "2307aa00", "metadata": {}, "source": [ - "# Lists and Dictionaries Required For Sustom Pipeline Steps" + "# Lists and Dictionaries Required For Custom Pipeline Steps" ] }, { @@ -62,6 +62,16 @@ "source": [ "# Custom operation to remove redundent coordinates\n", "class SqueezeStationCoordinates(petpipe.Operation):\n", + " \"\"\"\n", + " Squeeze singleton dimensions from specified station-based coordinates in an xarray.Dataset.\n", + "\n", + " This operation is useful for removing unnecessary singleton dimensions (e.g., shape (n, 1))\n", + " from coordinates like latitude, longitude, and elevation, ensuring they are 1D and indexed\n", + " by 'station'.\n", + "\n", + " Args:\n", + " coords (tuple of str): Names of coordinates to squeeze. Defaults to (\"latitude\", \"longitude\", \"elevation\").\n", + " \"\"\"\n", " def __init__(self, coords=(\"latitude\", \"longitude\", \"elevation\")):\n", " super().__init__()\n", " self.coords = coords\n", @@ -74,7 +84,7 @@ " # Undo function added otherwise pyearthtools will complain\n", " def undo_func(self, ds):\n", " # No undo operation needed for this operation\n", - " return ds" + " return ds\n" ] }, { From 78e9e198de7f727430a78ab5533bde27ad897675 Mon Sep 17 00:00:00 2001 From: Tennessee Leeuwenburg Date: Tue, 2 Sep 2025 12:31:31 +1000 Subject: [PATCH 13/20] Improve error message for numpy conversion process --- .../utils/src/pyearthtools/utils/data/converter.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/packages/utils/src/pyearthtools/utils/data/converter.py b/packages/utils/src/pyearthtools/utils/data/converter.py index a8f2e0c2..e8bac483 100644 --- a/packages/utils/src/pyearthtools/utils/data/converter.py +++ b/packages/utils/src/pyearthtools/utils/data/converter.py @@ -158,9 +158,16 @@ def _distill_dataset(self, dataset: XR_OBJECT) -> dict[DISTILL_KEYS, Any]: try: dims[use_shape.index(size)] = coord except ValueError as e: - raise RuntimeError( - "Cannot record coordinate, currently converter can only handle datasets with variables of the same dimensions." - ) from e + + msg = ( + f"Cannot record coordinate '{coord}', currently the conversion can only handle data variables with " + f"the same dimensionality as the dataset coords {list(dataset.coords)}. " + f"Data variable {variables[0]} with dimensions of {dataset[variables[0]].dims} was used to estimate the shape required. " + f"You may need to drop unused coordinates, drop mismatching data variables, or broadcast your data variables onto the " + "coordinates of the dataset yourself as the proper approach is user-defined." 
+            )
+
+            raise RuntimeError(msg) from e
 
             use_shape[use_shape.index(size)] = 1e10
         while None in dims:

From 23a9851d01b84dee99c3dcbc25c6af0557ca4929 Mon Sep 17 00:00:00 2001
From: Joel Miller
Date: Tue, 2 Sep 2025 17:27:21 +0100
Subject: [PATCH 14/20] Run again with good output after variable/numpy fix

---
 .../tutorial/HadISD/1_HadISD_Download.ipynb   | 62 ++++++++++++++-----
 1 file changed, 47 insertions(+), 15 deletions(-)

diff --git a/notebooks/tutorial/HadISD/1_HadISD_Download.ipynb b/notebooks/tutorial/HadISD/1_HadISD_Download.ipynb
index 71a8b0b3..ad495a08 100644
--- a/notebooks/tutorial/HadISD/1_HadISD_Download.ipynb
+++ b/notebooks/tutorial/HadISD/1_HadISD_Download.ipynb
@@ -89,7 +89,7 @@
    "name": "stdout",
    "output_type": "stream",
    "text": [
-     "Downloading HadISD data for WMO range: ['000000-029999', '500000-549999', '722000-722999', '800000-849999', '950000-999999']\n"
+     "Downloading HadISD data for WMO range: ['500000-549999', '722000-722999', '800000-849999']\n"
    ]
   }
  ],
@@ -293,19 +293,19 @@
    "name": "stdout",
    "output_type": "stream",
    "text": [
-     "No NetCDF files found for WMO range 000000-029999.\n",
-     "Starting download for WMO_000000-029999.tar...\n"
+     "No NetCDF files found for WMO range 500000-549999.\n",
+     "Starting download for WMO_500000-549999.tar...\n"
    ]
   },
   {
    "data": {
    "application/vnd.jupyter.widget-view+json": {
-     "model_id": "adba6f0822ac409db1863916821d8c62",
+     "model_id": "274ec4d5674b4781a8657b2c98db1d66",
      "version_major": 2,
      "version_minor": 0
     },
     "text/plain": [
-     "Downloading WMO_000000-029999.tar:   0%|          | 0.00/1.04G [00:00<?, ?B/s]"
+     "Downloading WMO_500000-549999.tar:   0%|          | 0.00/1.04G [00:00<?, ?B/s]"

From 1369b664f54ad2f471b6ecbec9d8cb6f6a67b59b Mon Sep 17 00:00:00 2001
From: Joel Miller
Date: Tue, 2 Sep 2025 17:33:49 +0100
Subject: [PATCH 15/20] Update pre-processing to remove redundant station_id
 variable causing numpy issues, also squeeze lat, lon and elev here instead
 of pipeline

---
 .../tutorial/HadISD/2_HadISD_to_zarr.ipynb | 298 ++++++++++--------
 1 file changed, 173 insertions(+), 125 deletions(-)

diff --git a/notebooks/tutorial/HadISD/2_HadISD_to_zarr.ipynb b/notebooks/tutorial/HadISD/2_HadISD_to_zarr.ipynb
index 4f052f68..1072df43 100644
--- a/notebooks/tutorial/HadISD/2_HadISD_to_zarr.ipynb
+++ b/notebooks/tutorial/HadISD/2_HadISD_to_zarr.ipynb
@@ -50,7 +50,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
   "id": "209b3b80",
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocess_station(file_path, date_range):\n",
    "    \"\"\"Open and preprocess a single NetCDF file.\"\"\"\n",
    "    ds = xr.open_dataset(file_path)\n",
-    " \n",
-    "    # Clean invalid attributes\n",
-    "    #ds = clean_attrs(ds)\n",
    "\n",
-    "    if 'input_station_id' in ds:\n",
-    "        ds = ds.drop_vars('input_station_id')\n",
+    "    # Drop redundant or uninformative station ID variables if present\n",
+    "    for var in ['input_station_id', 'station_id']:\n",
+    "        if var in ds:\n",
+    "            ds = ds.drop_vars(var)\n",
    "\n",
    "    # Assign station ID from attributes or filename\n",
    "    station_id = ds.attrs.get(\"station_id\", file_path.stem)\n",
-    "    ds = ds.assign_coords(station=station_id)\n",
+    "    ds = ds.expand_dims({\"station\": [station_id]})\n",
    "\n",
    "    # Promote lat/lon/elevation to coordinates (if not already)\n",
    "    for coord in [\"latitude\", \"longitude\", \"elevation\"]:\n",
@@ -238,24 +237,43 @@
    "output_type": "stream",
    "text": [
     "Starting Dask parallel conversion with 4 workers...\n",
-    "Converted: hadisd.3.4.0.2023f_19310101-20240101_504680-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_504680-99999.zarr\n",
-    "Converted: hadisd.3.4.0.2023f_19310101-20240101_506580-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_506580-99999.zarr\n",
-    
"Converted: hadisd.3.4.0.2023f_19310101-20240101_507450-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_507450-99999.zarr\n", - "Converted: hadisd.3.4.0.2023f_19310101-20240101_506030-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_506030-99999.zarr\n", - "Converted: hadisd.3.4.0.2023f_19310101-20240101_503530-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_503530-99999.zarr\n", - "Converted: hadisd.3.4.0.2023f_19310101-20240101_504420-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_504420-99999.zarr\n", - "Converted: hadisd.3.4.0.2023f_19310101-20240101_507560-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_507560-99999.zarr\n", - "Converted: hadisd.3.4.0.2023f_19310101-20240101_504340-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_504340-99999.zarr\n", - "Converted: hadisd.3.4.0.2023f_19310101-20240101_507270-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_507270-99999.zarr\n", - "Converted: hadisd.3.4.0.2023f_19310101-20240101_507740-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_507740-99999.zarr\n", - "Converted: hadisd.3.4.0.2023f_19310101-20240101_506320-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_506320-99999.zarr\n", - "Converted: hadisd.3.4.0.2023f_19310101-20240101_507880-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_507880-99999.zarr\n", - "Converted: hadisd.3.4.0.2023f_19310101-20240101_505270-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_505270-99999.zarr\n", - "Converted: hadisd.3.4.0.2023f_19310101-20240101_505570-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_505570-99999.zarr\n", - "Converted: hadisd.3.4.0.2023f_19310101-20240101_501360-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_501360-99999.zarr\n", - "Converted: hadisd.3.4.0.2023f_19310101-20240101_502460-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_502460-99999.zarr\n", - "Converted: hadisd.3.4.0.2023f_19310101-20240101_505640-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_505640-99999.zarr\n", - "Converted: hadisd.3.4.0.2023f_19310101-20240101_505480-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_505480-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956250-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956250-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956100-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956100-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_955910-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_955910-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956400-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956400-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956110-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956110-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956290-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956290-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956170-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956170-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956360-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956360-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956410-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956410-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956120-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956120-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956060-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956060-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956210-99999.nc → 
hadisd.3.4.0.2023f_19310101-20240101_956210-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956350-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956350-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956420-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956420-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956380-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956380-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956070-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956070-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956320-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956320-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956390-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956390-99999.zarr\n", + "Conversion complete. 18 new stations converted, 0 already present.\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956250-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956250-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956100-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956100-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_955910-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_955910-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956400-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956400-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956110-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956110-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956290-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956290-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956170-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956170-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956360-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956360-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956410-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956410-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956120-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956120-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956060-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956060-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956210-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956210-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956350-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956350-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956420-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956420-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956380-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956380-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956070-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956070-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956320-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956320-99999.zarr\n", + "Converted: hadisd.3.4.0.2023f_19310101-20240101_956390-99999.nc → hadisd.3.4.0.2023f_19310101-20240101_956390-99999.zarr\n", "Conversion complete. 
18 new stations converted, 0 already present.\n" ] } @@ -733,11 +751,11 @@ " elevation (station, coordinate_length) float64 144B dask.array<chunksize=(1, 1), meta=np.ndarray>\n", " latitude (station, coordinate_length) float64 144B dask.array<chunksize=(1, 1), meta=np.ndarray>\n", " longitude (station, coordinate_length) float64 144B dask.array<chunksize=(1, 1), meta=np.ndarray>\n", - " * station (station) <U12 864B '507270-99999' ... '506320-99999'\n", + " * station (station) object 144B '956100-99999' ... '956250-9...\n", " * time (time) datetime64[ns] 4MB 1970-01-01 ... 2023-12-3...\n", "Dimensions without coordinates: flagged, test, reporting_v, reporting_t,\n", " reporting_2, coordinate_length\n", - "Data variables: (12/26)\n", + "Data variables: (12/25)\n", " cloud_base (station, time) float64 68MB dask.array<chunksize=(1, 59169), meta=np.ndarray>\n", " dewpoints (station, time) float64 68MB dask.array<chunksize=(1, 59169), meta=np.ndarray>\n", " flagged_obs (station, time, flagged) float64 1GB dask.array<chunksize=(1, 29585, 3), meta=np.ndarray>\n", @@ -759,12 +777,12 @@ " creator_email: robert.dunn@metoffice.gov.uk\n", " creator_name: Robert Dunn\n", " ... ...\n", - " station_id: 507270-99999\n", + " station_id: 956100-99999\n", " station_information: Where station is a composite the station id ...\n", " summary: Quality-controlled, sub-daily, station datas...\n", - " time_coverage_end: 2023-12-31T21:00Z\n", - " time_coverage_start: 1956-08-20T06:00Z\n", - " title: HadISD
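A note on the assign_coords -> expand_dims change in PATCH 15, which is what the station dtype and variable-count differences in the output above trace back to: assign_coords(station=station_id) attaches "station" as a scalar, dimensionless coordinate, while expand_dims({"station": [station_id]}) gives each per-file dataset a real length-1 "station" dimension, so the per-station Zarr stores can be stacked along "station" when the combined dataset is built. A minimal sketch with a toy dataset (the variable names are illustrative, not the HadISD schema):

import numpy as np
import xarray as xr

# Toy stand-in for one preprocessed station file.
ds = xr.Dataset(
    {"temperatures": ("time", np.zeros(4))},
    coords={"time": np.arange(4)},
)
ds.attrs["station_id"] = "956100-99999"
station_id = ds.attrs.get("station_id", "unknown")

# Old approach: "station" is a scalar coordinate with no dimension;
# a station axis only exists if a later concat promotes it.
scalar = ds.assign_coords(station=station_id)
assert "station" not in scalar.dims

# New approach: "station" is a length-1 dimension on every variable,
# so datasets from different files stack along it directly.
dimmed = ds.expand_dims({"station": [station_id]})
assert dimmed.sizes["station"] == 1
assert dimmed["temperatures"].dims == ("station", "time")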