ACCESS-Community-Hub · tennlee · Jul 15, 2025 · Apr 11, 2025 · Apr 11, 2025 · Apr 15, 2025
diff --git a/notebooks/tutorial/HadISD/1_HadISD_Download.ipynb b/notebooks/tutorial/HadISD/1_HadISD_Download.ipynb
@@ -0,0 +1,253 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "7c7150db",
+   "metadata": {},
+   "source": [
+    "# HadISD Data Download Notebook\n",
+    "\n",
+    "This notebook will help you download a subset of the HadISD dataset directly from the Met Office website. The data will be stored in a user-specified directory (or a sensible default), and extracted for further processing (e.g., conversion to Zarr).\n",
+    "\n",
+    "- **Source:** [HadISD v3.4.0.2023f](https://www.metoffice.gov.uk/hadobs/hadisd/v340_2023f/download.html)\n",
+    "- **Instructions:**\n",
+    "    1. Set the download directory (or use the default).\n",
+    "    2. Download the data using Python's `requests` package.\n",
+    "    3. Extract the `.tar.gz` archive.\n",
+    "    4. The extracted files will be ready for use in the next notebook (`HadISD_to_zarr.ipynb`).\n",
+    "\n",
+    "> **Note:** Download size is large. Ensure you have sufficient disk space and a stable internet connection."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e4fb7b1d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "from tqdm.auto import tqdm\n",
+    "import tarfile\n",
+    "import gzip\n",
+    "import shutil"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "12a6abca",
+   "metadata": {},
+   "source": [
+    "### Retrieve Path to Download Directory\n",
+    "The download location will default to a folder named \"HadISD_data\" in your home directory.<br>\n",
+    "If you want to change this, you can do so in the `Data_Config.ipynb` configuration notebook. <br>\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2eadaf27",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%run Data_Config.ipynb\n",
+    "print(f\"Data will be downloaded to: {download_dir}\")   "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2d833606",
+   "metadata": {},
+   "source": [
+    "### Download HadISD Data\n",
+    "The following code will download the HadISD data files. Some files take longer to download than others depending on time of day. To download different WMO datasets, you can change `wmo_id_range` in the `Data_Config.ipynb` notebook.\n",
+    "\n",
+    "The full list of available data can be found here:\n",
+    "https://www.metoffice.gov.uk/hadobs/hadisd/v340_2023f/download.html"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "feb8d671",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TODO: Explain why stations are split into ranges, the file size, and why it is not necessary to download all stations. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11a188d4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(f\"Downloading HadISD data for WMO range: {wmo_id_range}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8ddbebda",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# `wmo_id_range` and `download_dir` are expected to come from Data_Config.ipynb (run above).\n",
+    "wmo_str = f\"WMO_{wmo_id_range}\"\n",
+    "url = f\"https://www.metoffice.gov.uk/hadobs/hadisd/v340_2023f/data/{wmo_str}.tar.gz\"\n",
+    "\n",
+    "# NOTE: the remote file is a gzip-compressed tarball, but it is stored locally with a\n",
+    "# plain `.tar` suffix; the extraction cell opens it in `r:gz` mode accordingly.\n",
+    "tar_name = f\"{wmo_str}.tar\"\n",
+    "filename = download_dir / tar_name"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "08ac36fd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Download the archive, skipping it when already complete and resuming a partial file\n",
+    "# when the server supports HTTP range requests.\n",
+    "request_timeout = 60  # seconds allowed for connecting / waiting on data before giving up\n",
+    "\n",
+    "# Get remote file size using HTTP HEAD\n",
+    "head = requests.head(url, allow_redirects=True, timeout=request_timeout)\n",
+    "head.raise_for_status()  # fail early on 404/5xx instead of saving an error page as the archive\n",
+    "remote_size = int(head.headers.get('content-length', 0))  # 0 means the size is unknown\n",
+    "\n",
+    "local_size = filename.stat().st_size if filename.exists() else 0\n",
+    "\n",
+    "if remote_size and local_size == remote_size:\n",
+    "    print(f\"File already fully downloaded: {filename} ({local_size/1024**2:.2f} MB)\")\n",
+    "else:\n",
+    "    headers = {}\n",
+    "    mode = 'wb'\n",
+    "    initial_pos = 0\n",
+    "    if 0 < local_size < remote_size:\n",
+    "        # Ask only for the bytes that are still missing and append them to the partial file.\n",
+    "        headers['Range'] = f'bytes={local_size}-'\n",
+    "        mode = 'ab'\n",
+    "        initial_pos = local_size\n",
+    "        print(f\"Resuming download for {filename.name} at {local_size/1024**2:.2f} MB...\")\n",
+    "    else:\n",
+    "        print(f\"Starting download for {filename.name}...\")\n",
+    "\n",
+    "    # The context manager guarantees the connection is released even if writing fails.\n",
+    "    with requests.get(url, stream=True, headers=headers, timeout=request_timeout) as response:\n",
+    "        response.raise_for_status()\n",
+    "        if mode == 'ab' and response.status_code != 206:\n",
+    "            # The server ignored the Range header and is sending the whole file again;\n",
+    "            # appending it would corrupt the archive, so overwrite from the start instead.\n",
+    "            print(\"Server did not honour the resume request; restarting download from the beginning.\")\n",
+    "            mode = 'wb'\n",
+    "            initial_pos = 0\n",
+    "\n",
+    "        with open(filename, mode) as f, tqdm(\n",
+    "            desc=f\"Downloading {filename.name}\",\n",
+    "            total=remote_size or None,  # None lets tqdm show progress without a known total\n",
+    "            initial=initial_pos,\n",
+    "            unit='B', unit_scale=True, unit_divisor=1024\n",
+    "        ) as bar:\n",
+    "            for chunk in response.iter_content(chunk_size=8192):\n",
+    "                if chunk:\n",
+    "                    f.write(chunk)\n",
+    "                    bar.update(len(chunk))\n",
+    "\n",
+    "    final_size = filename.stat().st_size\n",
+    "    if not remote_size:\n",
+    "        print(f\"Download finished: {filename} ({final_size/1024**2:.2f} MB). Remote size unknown, so completeness could not be verified.\")\n",
+    "    elif final_size == remote_size:\n",
+    "        print(f\"Download complete: {filename} ({final_size/1024**2:.2f} MB)\")\n",
+    "    else:\n",
+    "        print(f\"Warning: Download incomplete. Local size: {final_size}, Remote size: {remote_size}\")\n",
+    "\n",
+    "# TODO: also check whether the netcdf files for this tar file already exist, and if so skip the download"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4da19a94",
+   "metadata": {},
+   "source": [
+    "### Extract Tar Files and Move to Netcdf Subfolder"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fb79a81c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Unpack the downloaded archive into <download_dir>/<WMO_...>, then remove the archive.\n",
+    "extract_dir = download_dir / tar_name.replace('.tar', '')\n",
+    "extract_dir.mkdir(parents=True, exist_ok=True)\n",
+    "\n",
+    "extracted_files = list(extract_dir.glob('*'))\n",
+    "if extracted_files:\n",
+    "    print(f\"Extraction directory '{extract_dir}' already contains {len(extracted_files)} files. Skipping extraction.\")\n",
+    "elif filename.exists():\n",
+    "    with tarfile.open(filename, \"r:gz\") as tar:\n",
+    "        if hasattr(tarfile, 'data_filter'):\n",
+    "            # The 'data' filter refuses absolute paths, '..' traversal and special files,\n",
+    "            # so a malformed archive cannot write outside extract_dir.\n",
+    "            tar.extractall(path=extract_dir, filter='data')\n",
+    "        else:\n",
+    "            # Older Python without extraction filters: fall back to the plain call.\n",
+    "            tar.extractall(path=extract_dir)\n",
+    "    extracted_files = list(extract_dir.glob('*'))\n",
+    "    if extracted_files:\n",
+    "        print(f\"Extraction successful. {len(extracted_files)} files found in {extract_dir}.\")\n",
+    "        # Delete the tar file after extraction\n",
+    "        filename.unlink()\n",
+    "        print(f\"Deleted tar file: {filename}\")\n",
+    "    else:\n",
+    "        print(f\"Warning: No files extracted to {extract_dir}. Tar file will not be deleted.\")\n",
+    "        raise RuntimeError(\"Extraction failed, tar file not deleted.\")\n",
+    "else:\n",
+    "    print(\"No tar file found and extraction directory is empty. Nothing to extract.\")\n",
+    "    raise FileNotFoundError(f\"Missing tar file: {filename}\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "53161550",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create subfolder for netcdf\n",
+    "# Destination folder for the decompressed NetCDF files; created if it does not exist yet.\n",
+    "netcdf_dir = download_dir.joinpath(\"netcdf\")\n",
+    "netcdf_dir.mkdir(exist_ok=True, parents=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4e43dcc4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Decompress every extracted .nc.gz file and move the resulting .nc file into netcdf_dir.\n",
+    "num_files = 0\n",
+    "# rglob also finds files when the archive unpacks into a sub-folder; sorted() materialises\n",
+    "# the listing first because the loop deletes and creates files in the same tree.\n",
+    "for gz_path in sorted(extract_dir.rglob('*.nc.gz')):\n",
+    "    nc_path = gz_path.with_suffix('')  # Remove .gz extension\n",
+    "    with gzip.open(gz_path, 'rb') as f_in, open(nc_path, 'wb') as f_out:\n",
+    "        shutil.copyfileobj(f_in, f_out)  # stream in chunks rather than reading the whole file into memory\n",
+    "    gz_path.unlink()  # Delete the .gz file after extraction\n",
+    "    shutil.move(str(nc_path), netcdf_dir / nc_path.name)\n",
+    "    num_files += 1\n",
+    "\n",
+    "print(f\"{num_files} .nc files have been extracted, cleaned up, and moved to the netcdf directory: {netcdf_dir}\")\n",
+    "\n",
+    "# Delete the extraction directory after processing, unless nothing was processed and it\n",
+    "# still holds files: the tar archive is already gone at this point, so removing them\n",
+    "# would force a full re-download.\n",
+    "leftover_files = [p for p in extract_dir.rglob('*') if p.is_file()] if extract_dir.exists() else []\n",
+    "if num_files == 0 and leftover_files:\n",
+    "    print(f\"Warning: no .nc.gz files were found, but {extract_dir} still contains {len(leftover_files)} file(s). Keeping the directory.\")\n",
+    "else:\n",
+    "    try:\n",
+    "        shutil.rmtree(extract_dir)\n",
+    "        print(f\"Deleted extraction directory: {extract_dir}\")\n",
+    "    except OSError as e:\n",
+    "        # Best-effort clean-up: report the problem but do not abort the notebook.\n",
+    "        print(f\"Could not delete extraction directory {extract_dir}: {e}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "pyearthtools",
+   "language": "python",
+   "name": "pyearthtools"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}