From 8fbe2be4b2392576373f4b7264f7583d38992b33 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Mon, 9 Feb 2026 10:52:24 -0500 Subject: [PATCH 1/2] Address PR #505 review feedback - Extract shared etl_argparser() into utils/db.py to eliminate repeated boilerplate across 7 ETL scripts - Label hardcoded dollar targets with HARDCODED_YEAR = 2024 instead of dynamic time_period; add warnings.warn when dataset year differs - Delete dead get_pseudo_input_variables() and update callers - Switch DEFAULT_DATASET to local storage path for local-first workflow - Add promote-dataset Makefile target and HF_CLONE_DIR variable - Add SOI Congress-session constants with RuntimeError guard for future tax-year bumps - Update Makefile comments for stratified CPS parameters Co-Authored-By: Claude Opus 4.6 --- Makefile | 19 +++- changelog_entry.yaml | 10 ++ .../calibration_utils.py | 24 ----- .../create_stratified_cps.py | 13 --- .../stacked_dataset_builder.py | 13 +-- .../db/create_initial_strata.py | 26 +----- policyengine_us_data/db/etl_age.py | 33 ++----- policyengine_us_data/db/etl_irs_soi.py | 73 ++++++--------- policyengine_us_data/db/etl_medicaid.py | 32 ++----- .../db/etl_national_targets.py | 91 +++++++++---------- policyengine_us_data/db/etl_snap.py | 32 ++----- .../db/etl_state_income_tax.py | 29 +----- .../calibration_targets/pull_soi_targets.py | 13 ++- policyengine_us_data/utils/db.py | 46 +++++++++- policyengine_us_data/utils/loss.py | 8 +- 15 files changed, 183 insertions(+), 279 deletions(-) diff --git a/Makefile b/Makefile index 09d984a96..bfa9bfad2 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,6 @@ -.PHONY: all format test install download upload docker documentation data publish-local-area clean build paper clean-paper presentations database database-refresh promote-database +.PHONY: all format test install download upload docker documentation data publish-local-area clean build paper clean-paper presentations database database-refresh promote-database promote-dataset + +HF_CLONE_DIR = $(HOME)/devl/huggingface/policyengine-us-data all: data test @@ -72,12 +74,17 @@ database-refresh: promote-database: cp policyengine_us_data/storage/calibration/policy_data.db \ - $(HOME)/devl/huggingface/policyengine-us-data/calibration/policy_data.db - rm -rf $(HOME)/devl/huggingface/policyengine-us-data/calibration/raw_inputs + $(HF_CLONE_DIR)/calibration/policy_data.db + rm -rf $(HF_CLONE_DIR)/calibration/raw_inputs cp -r policyengine_us_data/storage/calibration/raw_inputs \ - $(HOME)/devl/huggingface/policyengine-us-data/calibration/raw_inputs + $(HF_CLONE_DIR)/calibration/raw_inputs @echo "Copied DB and raw_inputs to HF clone. Now cd to HF repo, commit, and push." +promote-dataset: + cp policyengine_us_data/storage/stratified_extended_cps_2024.h5 \ + $(HF_CLONE_DIR)/calibration/stratified_extended_cps.h5 + @echo "Copied dataset to HF clone. Now cd to HF repo, commit, and push." + data: download python policyengine_us_data/utils/uprating.py python policyengine_us_data/datasets/acs/acs.py @@ -87,6 +94,10 @@ data: download python policyengine_us_data/datasets/cps/extended_cps.py python policyengine_us_data/datasets/cps/enhanced_cps.py python policyengine_us_data/datasets/cps/small_enhanced_cps.py + # 12000: number of households our GPUs can handle (found via trial and error). + # --top=99.5: include only top 0.5% (vs default 1%) to preserve + # representation of lower-income households. + # --seed=3526: reproducible stratified sampling. 
python policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py 12000 --top=99.5 --seed=3526 publish-local-area: diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29bb..a585d4b8a 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,10 @@ +- bump: patch + changes: + changed: + - Switch DEFAULT_DATASET to local storage path for database ETL scripts + - Extract shared etl_argparser() to reduce boilerplate across 7 ETL scripts + - Delete dead get_pseudo_input_variables() function + added: + - promote-dataset Makefile target + - Year-mismatch warning in national targets ETL + - Congress-session constants and warning in SOI district puller diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py b/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py index c27cf3e04..6a5c415ec 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py @@ -248,30 +248,6 @@ def get_calculated_variables(sim) -> List[str]: return result -def get_pseudo_input_variables(sim) -> set: - """ - Identify pseudo-input variables that should NOT be saved to H5 files. - - NOTE: This function currently returns an empty set. The original logic - excluded variables with 'adds' or 'subtracts' attributes, but analysis - showed that in CPS data, these variables contain authoritative stored - data that does NOT match their component variables: - - - pre_tax_contributions: components are all 0, aggregate has imputed values - - tax_exempt_pension_income: aggregate has 135M, components only 20M - - taxable_pension_income: aggregate has 82M, components only 29M - - interest_deduction: aggregate has 41M, components are 0 - - The 'adds' attribute defines how to CALCULATE these values, but in CPS - data the stored values are the authoritative source. Excluding them and - recalculating from components produces incorrect results. - - For geo-stacking, entity ID reindexing preserves within-entity - relationships, so aggregation within a person or tax_unit remains valid. - """ - return set() - - def apply_op(values: np.ndarray, op: str, val: str) -> np.ndarray: """Apply constraint operation to values array.""" try: diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py b/policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py index ba1011016..39b0b7771 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py @@ -13,9 +13,6 @@ from policyengine_us import Microsimulation from policyengine_core.data.dataset import Dataset from policyengine_core.enums import Enum -from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import ( - get_pseudo_input_variables, -) def create_stratified_cps_dataset( @@ -225,16 +222,6 @@ def create_stratified_cps_dataset( # Only save input variables (not calculated/derived variables) input_vars = set(sim.input_variables) - - # Filter out pseudo-inputs: variables with adds/subtracts that aggregate - # formula-based components. These have stale values that corrupt calculations. 
- pseudo_inputs = get_pseudo_input_variables(sim) - if pseudo_inputs: - print(f"Excluding {len(pseudo_inputs)} pseudo-input variables:") - for var in sorted(pseudo_inputs): - print(f" - {var}") - input_vars = input_vars - pseudo_inputs - print(f"Found {len(input_vars)} input variables to save") for variable in stratified_sim.tax_benefit_system.variables: diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py b/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py index 209218cf6..010e151f3 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py @@ -14,7 +14,6 @@ from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import ( get_all_cds_from_database, get_calculated_variables, - get_pseudo_input_variables, STATE_CODES, STATE_FIPS_TO_NAME, STATE_FIPS_TO_CODE, @@ -624,17 +623,7 @@ def create_sparse_cd_stacked_dataset( # Only save input variables (not calculated/derived variables) # Calculated variables like state_name, state_code will be recalculated on load - input_vars = set(base_sim.input_variables) - - # Filter out pseudo-inputs: variables with adds/subtracts that aggregate - # formula-based components. These have stale values that corrupt calculations. - pseudo_inputs = get_pseudo_input_variables(base_sim) - if pseudo_inputs: - print(f"Excluding {len(pseudo_inputs)} pseudo-input variables:") - for var in sorted(pseudo_inputs): - print(f" - {var}") - - vars_to_save = input_vars - pseudo_inputs + vars_to_save = set(base_sim.input_variables) print(f"Found {len(vars_to_save)} input variables to save") # congressional_district_geoid isn't in the original microdata and has no formula, diff --git a/policyengine_us_data/db/create_initial_strata.py b/policyengine_us_data/db/create_initial_strata.py index 8dda76e29..bb17e7472 100644 --- a/policyengine_us_data/db/create_initial_strata.py +++ b/policyengine_us_data/db/create_initial_strata.py @@ -1,4 +1,3 @@ -import argparse import logging from typing import Dict @@ -7,12 +6,11 @@ from sqlmodel import Session, create_engine from policyengine_us_data.storage import STORAGE_FOLDER - -DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, ) +from policyengine_us_data.utils.db import etl_argparser from policyengine_us_data.utils.raw_cache import ( is_cached, save_json, @@ -71,27 +69,7 @@ def fetch_congressional_districts(year): def main(): - parser = argparse.ArgumentParser( - description="Create initial geographic strata for calibration" - ) - parser.add_argument( - "--dataset", - default=DEFAULT_DATASET, - help=( - "Source dataset (local path or HuggingFace URL). " - "The year for Census API calls is derived from the dataset's " - "default_calculation_period. 
Default: %(default)s" - ), - ) - args = parser.parse_args() - - # Derive year from dataset - from policyengine_us import Microsimulation - - print(f"Loading dataset: {args.dataset}") - sim = Microsimulation(dataset=args.dataset) - year = int(sim.default_calculation_period) - print(f"Derived year from dataset: {year}") + _, year = etl_argparser("Create initial geographic strata for calibration") # State FIPS to name/abbreviation mapping STATE_NAMES = { diff --git a/policyengine_us_data/db/etl_age.py b/policyengine_us_data/db/etl_age.py index 2e213d92b..74d5ec003 100644 --- a/policyengine_us_data/db/etl_age.py +++ b/policyengine_us_data/db/etl_age.py @@ -1,13 +1,8 @@ -import argparse - import pandas as pd import numpy as np from sqlmodel import Session, create_engine, select from policyengine_us_data.storage import STORAGE_FOLDER - -DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" - from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -15,7 +10,11 @@ SourceType, ) from policyengine_us_data.utils.census import get_census_docs, pull_acs_table -from policyengine_us_data.utils.db import parse_ucgid, get_geographic_strata +from policyengine_us_data.utils.db import ( + parse_ucgid, + get_geographic_strata, + etl_argparser, +) from policyengine_us_data.utils.db_metadata import ( get_or_create_source, get_or_create_variable_group, @@ -287,27 +286,7 @@ def load_age_data(df_long, geo, year): def main(): - parser = argparse.ArgumentParser( - description="ETL for age calibration targets" - ) - parser.add_argument( - "--dataset", - default=DEFAULT_DATASET, - help=( - "Source dataset (local path or HuggingFace URL). " - "The year for Census API calls is derived from the dataset's " - "default_calculation_period. 
Default: %(default)s" - ), - ) - args = parser.parse_args() - - # Derive year from dataset - from policyengine_us import Microsimulation - - print(f"Loading dataset: {args.dataset}") - sim = Microsimulation(dataset=args.dataset) - year = int(sim.default_calculation_period) - print(f"Derived year from dataset: {year}") + _, year = etl_argparser("ETL for age calibration targets") # --- ETL: Extract, Transform, Load ---- diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py index 5f191ce3f..490c99a01 100644 --- a/policyengine_us_data/db/etl_irs_soi.py +++ b/policyengine_us_data/db/etl_irs_soi.py @@ -1,4 +1,3 @@ -import argparse import logging from typing import Optional @@ -8,19 +7,6 @@ from sqlmodel import Session, create_engine, select from policyengine_us_data.storage import STORAGE_FOLDER - -DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" - -# IRS SOI data is typically available ~2 years after the tax year -IRS_SOI_LAG_YEARS = 2 -from policyengine_us_data.utils.raw_cache import ( - is_cached, - cache_path, - save_bytes, -) - -logger = logging.getLogger(__name__) - from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -34,6 +20,7 @@ get_stratum_parent, parse_ucgid, get_geographic_strata, + etl_argparser, ) from policyengine_us_data.utils.db_metadata import ( get_or_create_source, @@ -44,6 +31,17 @@ from policyengine_us_data.storage.calibration_targets.make_district_mapping import ( get_district_mapping, ) +from policyengine_us_data.utils.raw_cache import ( + is_cached, + cache_path, + save_bytes, +) + +logger = logging.getLogger(__name__) + + +# IRS SOI data is typically available ~2 years after the tax year +IRS_SOI_LAG_YEARS = 2 """See the 22incddocguide.docx manual from the IRS SOI""" # Language in the doc: '$10,000 under $25,000' means >= $10,000 and < $25,000 @@ -1236,40 +1234,23 @@ def load_soi_data(long_dfs, year): def main(): - parser = argparse.ArgumentParser( - description="ETL for IRS SOI calibration targets" - ) - parser.add_argument( - "--dataset", - default=DEFAULT_DATASET, - help=( - "Source dataset (local path or HuggingFace URL). " - "The year for IRS SOI data is derived from the dataset's " - "default_calculation_period minus IRS_SOI_LAG_YEARS. 
" - "Default: %(default)s" - ), - ) - parser.add_argument( - "--lag", - type=int, - default=IRS_SOI_LAG_YEARS, - help=( - "Years to subtract from dataset year for IRS SOI data " - "(default: %(default)s, since IRS data is ~2 years behind)" - ), - ) - args = parser.parse_args() - - # Derive year from dataset with lag applied - from policyengine_us import Microsimulation + def add_lag_arg(parser): + parser.add_argument( + "--lag", + type=int, + default=IRS_SOI_LAG_YEARS, + help=( + "Years to subtract from dataset year for IRS SOI data " + "(default: %(default)s, since IRS data is ~2 years behind)" + ), + ) - print(f"Loading dataset: {args.dataset}") - sim = Microsimulation(dataset=args.dataset) - dataset_year = int(sim.default_calculation_period) - year = dataset_year - args.lag - print( - f"Dataset year: {dataset_year}, IRS SOI year: {year} (lag={args.lag})" + args, dataset_year = etl_argparser( + "ETL for IRS SOI calibration targets", + extra_args_fn=add_lag_arg, ) + year = dataset_year - args.lag + print(f"IRS SOI year: {year} (lag={args.lag})") # Extract ----------------------- raw_df = extract_soi_data() diff --git a/policyengine_us_data/db/etl_medicaid.py b/policyengine_us_data/db/etl_medicaid.py index 435ccd42c..7b34863e0 100644 --- a/policyengine_us_data/db/etl_medicaid.py +++ b/policyengine_us_data/db/etl_medicaid.py @@ -1,4 +1,3 @@ -import argparse import logging import requests @@ -7,9 +6,6 @@ from sqlmodel import Session, create_engine, select from policyengine_us_data.storage import STORAGE_FOLDER - -DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" - from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -20,7 +16,11 @@ STATE_ABBREV_TO_FIPS, pull_acs_table, ) -from policyengine_us_data.utils.db import parse_ucgid, get_geographic_strata +from policyengine_us_data.utils.db import ( + parse_ucgid, + get_geographic_strata, + etl_argparser, +) from policyengine_us_data.utils.db_metadata import ( get_or_create_source, get_or_create_variable_group, @@ -328,27 +328,7 @@ def load_medicaid_data(long_state, long_cd, year): def main(): - parser = argparse.ArgumentParser( - description="ETL for Medicaid calibration targets" - ) - parser.add_argument( - "--dataset", - default=DEFAULT_DATASET, - help=( - "Source dataset (local path or HuggingFace URL). " - "The year for targets is derived from the dataset's " - "default_calculation_period. 
Default: %(default)s" - ), - ) - args = parser.parse_args() - - # Derive year from dataset - from policyengine_us import Microsimulation - - print(f"Loading dataset: {args.dataset}") - sim = Microsimulation(dataset=args.dataset) - year = int(sim.default_calculation_period) - print(f"Derived year from dataset: {year}") + _, year = etl_argparser("ETL for Medicaid calibration targets") # Extract ------------------------------ state_admin_df = extract_administrative_medicaid_data(year) diff --git a/policyengine_us_data/db/etl_national_targets.py b/policyengine_us_data/db/etl_national_targets.py index fd97b83f4..7688b5705 100644 --- a/policyengine_us_data/db/etl_national_targets.py +++ b/policyengine_us_data/db/etl_national_targets.py @@ -1,4 +1,4 @@ -import argparse +import warnings from sqlmodel import Session, create_engine import pandas as pd @@ -10,12 +10,14 @@ Target, SourceType, ) +from policyengine_us_data.utils.db import ( + DEFAULT_DATASET, + etl_argparser, +) from policyengine_us_data.utils.db_metadata import ( get_or_create_source, ) -DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" - def extract_national_targets(dataset: str = DEFAULT_DATASET): """ @@ -46,7 +48,16 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET): time_period = int(sim.default_calculation_period) print(f"Derived time_period from dataset: {time_period}") - # Direct sum targets - use the time_period derived from the dataset + # Hardcoded dollar targets are specific to 2024 and should be + # labeled as such. Only CBO/Treasury parameter lookups use the + # dynamic time_period derived from the dataset. + HARDCODED_YEAR = 2024 + if time_period != HARDCODED_YEAR: + warnings.warn( + f"Dataset year ({time_period}) != HARDCODED_YEAR " + f"({HARDCODED_YEAR}). Hardcoded dollar targets may " + f"be stale and need re-sourcing." 
+ ) # Separate tax-related targets that need filer constraint tax_filer_targets = [ @@ -55,35 +66,35 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET): "value": 21.247e9, "source": "Joint Committee on Taxation", "notes": "SALT deduction tax expenditure", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "medical_expense_deduction", "value": 11.4e9, "source": "Joint Committee on Taxation", "notes": "Medical expense deduction tax expenditure", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "charitable_deduction", "value": 65.301e9, "source": "Joint Committee on Taxation", "notes": "Charitable deduction tax expenditure", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "interest_deduction", "value": 24.8e9, "source": "Joint Committee on Taxation", "notes": "Mortgage interest deduction tax expenditure", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "qualified_business_income_deduction", "value": 63.1e9, "source": "Joint Committee on Taxation", "notes": "QBI deduction tax expenditure", - "year": time_period, + "year": HARDCODED_YEAR, }, ] @@ -93,112 +104,112 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET): "value": 13e9, "source": "Survey-reported (post-TCJA grandfathered)", "notes": "Alimony received - survey reported, not tax-filer restricted", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "alimony_expense", "value": 13e9, "source": "Survey-reported (post-TCJA grandfathered)", "notes": "Alimony paid - survey reported, not tax-filer restricted", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "medicaid", "value": 871.7e9, "source": "https://www.cms.gov/files/document/highlights.pdf", "notes": "CMS 2023 highlights document - total Medicaid spending", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "net_worth", "value": 160e12, "source": "Federal Reserve SCF", "notes": "Total household net worth", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "health_insurance_premiums_without_medicare_part_b", "value": 385e9, "source": "MEPS/NHEA", "notes": "Health insurance premiums excluding Medicare Part B", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "other_medical_expenses", "value": 278e9, "source": "MEPS/NHEA", "notes": "Out-of-pocket medical expenses", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "medicare_part_b_premiums", "value": 112e9, "source": "CMS Medicare data", "notes": "Medicare Part B premium payments", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "over_the_counter_health_expenses", "value": 72e9, "source": "Consumer Expenditure Survey", "notes": "OTC health products and supplies", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "child_support_expense", "value": 33e9, "source": "Census Bureau", "notes": "Child support payments", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "child_support_received", "value": 33e9, "source": "Census Bureau", "notes": "Child support received", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "spm_unit_capped_work_childcare_expenses", "value": 348e9, "source": "Census Bureau SPM", "notes": "Work and childcare expenses for SPM", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "spm_unit_capped_housing_subsidy", "value": 35e9, "source": "HUD/Census", "notes": "Housing subsidies", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "tanf", 
"value": 9e9, "source": "HHS/ACF", "notes": "TANF cash assistance", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "real_estate_taxes", "value": 500e9, "source": "Census Bureau", "notes": "Property taxes paid", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "rent", "value": 735e9, "source": "Census Bureau/BLS", "notes": "Rental payments", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "tip_income", "value": 53.2e9, "source": "IRS Form W-2 Box 7 statistics", "notes": "Social security tips uprated 40% to account for underreporting", - "year": time_period, + "year": HARDCODED_YEAR, }, # SSA benefit-type totals derived from trust fund data and # SSA fact sheet type shares @@ -207,28 +218,28 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET): "value": 1_060e9, "source": "https://www.ssa.gov/OACT/STATS/table4a3.html", "notes": "~73% of total OASDI ($1,452B CBO projection)", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "social_security_disability", "value": 148e9, "source": "https://www.ssa.gov/OACT/STATS/table4a3.html", "notes": "~10.2% of total OASDI (disabled workers)", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "social_security_survivors", "value": 160e9, "source": "https://www.ssa.gov/OACT/FACTS/", "notes": "~11.0% of total OASDI (widows, children of deceased)", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "social_security_dependents", "value": 84e9, "source": "https://www.ssa.gov/OACT/FACTS/", "notes": "~5.8% of total OASDI (spouses/children of retired+disabled)", - "year": time_period, + "year": HARDCODED_YEAR, }, # IRA contribution totals from IRS SOI accumulation tables { @@ -236,14 +247,14 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET): "value": 25e9, "source": "https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements", "notes": "Tax year 2022 (~5M x $4,510 avg) uprated ~12% to 2024", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "roth_ira_contributions", "value": 39e9, "source": "https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements", "notes": "Tax year 2022 (~10M x $3,482 avg) uprated ~12% to 2024", - "year": time_period, + "year": HARDCODED_YEAR, }, ] @@ -256,7 +267,7 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET): "person_count": 72_429_055, "source": "CMS/HHS administrative data", "notes": "Medicaid enrollment count", - "year": time_period, + "year": HARDCODED_YEAR, }, { "constraint_variable": "aca_ptc", @@ -264,7 +275,7 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET): "person_count": 19_743_689, "source": "CMS marketplace data", "notes": "ACA Premium Tax Credit recipients", - "year": time_period, + "year": HARDCODED_YEAR, }, ] @@ -715,19 +726,7 @@ def load_national_targets( def main(): """Main ETL pipeline for national targets.""" - parser = argparse.ArgumentParser( - description="ETL for national calibration targets" - ) - parser.add_argument( - "--dataset", - default=DEFAULT_DATASET, - help=( - "Source dataset (local path or HuggingFace URL). " - "The time_period for targets is derived from the dataset's " - "default_calculation_period. 
Default: %(default)s" - ), - ) - args = parser.parse_args() + args, _ = etl_argparser("ETL for national calibration targets") # Extract print("Extracting national targets...") diff --git a/policyengine_us_data/db/etl_snap.py b/policyengine_us_data/db/etl_snap.py index 554f67ec1..ffa5ee6d2 100644 --- a/policyengine_us_data/db/etl_snap.py +++ b/policyengine_us_data/db/etl_snap.py @@ -1,4 +1,3 @@ -import argparse import logging import requests import zipfile @@ -10,9 +9,6 @@ from sqlmodel import Session, create_engine, select from policyengine_us_data.storage import STORAGE_FOLDER - -DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" - from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -26,7 +22,11 @@ pull_acs_table, STATE_NAME_TO_FIPS, ) -from policyengine_us_data.utils.db import parse_ucgid, get_geographic_strata +from policyengine_us_data.utils.db import ( + parse_ucgid, + get_geographic_strata, + etl_argparser, +) from policyengine_us_data.utils.db_metadata import ( get_or_create_source, get_or_create_variable_group, @@ -367,27 +367,7 @@ def load_survey_snap_data(survey_df, year, snap_stratum_lookup): def main(): - parser = argparse.ArgumentParser( - description="ETL for SNAP calibration targets" - ) - parser.add_argument( - "--dataset", - default=DEFAULT_DATASET, - help=( - "Source dataset (local path or HuggingFace URL). " - "The year for targets is derived from the dataset's " - "default_calculation_period. Default: %(default)s" - ), - ) - args = parser.parse_args() - - # Derive year from dataset - from policyengine_us import Microsimulation - - print(f"Loading dataset: {args.dataset}") - sim = Microsimulation(dataset=args.dataset) - year = int(sim.default_calculation_period) - print(f"Derived year from dataset: {year}") + _, year = etl_argparser("ETL for SNAP calibration targets") # Extract --------- zip_file_admin = extract_administrative_snap_data() diff --git a/policyengine_us_data/db/etl_state_income_tax.py b/policyengine_us_data/db/etl_state_income_tax.py index 9da8d8390..89d941438 100644 --- a/policyengine_us_data/db/etl_state_income_tax.py +++ b/policyengine_us_data/db/etl_state_income_tax.py @@ -10,15 +10,12 @@ Stratum Group ID: 7 (State Income Tax) """ -import argparse import logging import pandas as pd import numpy as np from sqlmodel import Session, create_engine, select from policyengine_us_data.storage import STORAGE_FOLDER - -DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -28,7 +25,7 @@ VariableGroup, VariableMetadata, ) -from policyengine_us_data.utils.db import get_geographic_strata +from policyengine_us_data.utils.db import get_geographic_strata, etl_argparser from policyengine_us_data.utils.db_metadata import ( get_or_create_source, get_or_create_variable_group, @@ -42,6 +39,7 @@ logger = logging.getLogger(__name__) + # Stratum group ID for state income tax targets STRATUM_GROUP_ID_STATE_INCOME_TAX = 7 @@ -345,32 +343,11 @@ def load_state_income_tax_data(df: pd.DataFrame, year: int) -> dict: def main(): """Run the full ETL pipeline for state income tax targets.""" - parser = argparse.ArgumentParser( - description="ETL for state income tax calibration targets" - ) - parser.add_argument( - "--dataset", - default=DEFAULT_DATASET, - help=( - "Source dataset (local path or HuggingFace URL). 
" - "The year for targets is derived from the dataset's " - "default_calculation_period. Default: %(default)s" - ), - ) - args = parser.parse_args() - logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", ) - - # Derive year from dataset - from policyengine_us import Microsimulation - - logger.info(f"Loading dataset: {args.dataset}") - sim = Microsimulation(dataset=args.dataset) - year = int(sim.default_calculation_period) - logger.info(f"Derived year from dataset: {year}") + _, year = etl_argparser("ETL for state income tax calibration targets") logger.info(f"Extracting Census STC data for FY{year}...") raw_df = extract_state_income_tax_data(year) diff --git a/policyengine_us_data/storage/calibration_targets/pull_soi_targets.py b/policyengine_us_data/storage/calibration_targets/pull_soi_targets.py index c3f159191..59050a1b3 100644 --- a/policyengine_us_data/storage/calibration_targets/pull_soi_targets.py +++ b/policyengine_us_data/storage/calibration_targets/pull_soi_targets.py @@ -38,6 +38,9 @@ "$500,000 or more": (500_000, np.inf), } +SOI_CONGRESS_PREFIX = "5001800US" # 118th Congress +SOI_DISTRICT_TAX_YEAR = 2022 + NON_VOTING_STATES = {"US", "AS", "GU", "MP", "PR", "VI", "OA"} NON_VOTING_GEO_IDS = { "0400000US72", # Puerto Rico (state level) @@ -249,7 +252,15 @@ def pull_district_soi_variable( df["CONG_DISTRICT"] = ( df["CONG_DISTRICT"].astype(int).astype(str).str.zfill(2) ) - df["GEO_ID"] = "5001800US" + df["STATEFIPS"] + df["CONG_DISTRICT"] + if SOI_DISTRICT_TAX_YEAR >= 2024: + raise RuntimeError( + f"SOI tax year {SOI_DISTRICT_TAX_YEAR} may need " + f"119th Congress districts (5001900US). Update " + f"SOI_CONGRESS_PREFIX and remove this check " + f"once verified." + ) + + df["GEO_ID"] = SOI_CONGRESS_PREFIX + df["STATEFIPS"] + df["CONG_DISTRICT"] df = df[~df["GEO_ID"].isin(NON_VOTING_GEO_IDS)] at_large_states = ( diff --git a/policyengine_us_data/utils/db.py b/policyengine_us_data/utils/db.py index 4de79c44f..378e230ea 100644 --- a/policyengine_us_data/utils/db.py +++ b/policyengine_us_data/utils/db.py @@ -1,4 +1,5 @@ -from typing import Dict, List, Optional +import argparse +from typing import Dict, List, Optional, Tuple from sqlmodel import Session, select import sqlalchemy as sa @@ -7,6 +8,49 @@ Stratum, StratumConstraint, ) +from policyengine_us_data.storage import STORAGE_FOLDER + +DEFAULT_DATASET = str(STORAGE_FOLDER / "stratified_extended_cps_2024.h5") + + +def etl_argparser( + description: str, + extra_args_fn=None, +) -> Tuple[argparse.Namespace, int]: + """Shared argument parsing and dataset-year derivation for ETL scripts. + + Args: + description: Description for the argparse help text. + extra_args_fn: Optional callable that receives the parser to add + extra arguments before parsing. + + Returns: + (args, year) where *year* is derived from the dataset's + ``default_calculation_period``. + """ + parser = argparse.ArgumentParser(description=description) + parser.add_argument( + "--dataset", + default=DEFAULT_DATASET, + help=( + "Source dataset (local path or HuggingFace URL). " + "The year is derived from the dataset's " + "default_calculation_period. 
Default: %(default)s" + ), + ) + if extra_args_fn is not None: + extra_args_fn(parser) + + args = parser.parse_args() + + from policyengine_us import Microsimulation + + print(f"Loading dataset: {args.dataset}") + sim = Microsimulation(dataset=args.dataset) + year = int(sim.default_calculation_period) + print(f"Derived year from dataset: {year}") + + return args, year def get_stratum_by_id(session: Session, stratum_id: int) -> Optional[Stratum]: diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index bb3f8cb88..134e919b3 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -10,9 +10,11 @@ from policyengine_us_data.utils.soi import pe_to_soi, get_soi # National calibration targets consumed by build_loss_matrix(). -# These are duplicated in db/etl_national_targets.py which loads them -# into policy_data.db. A future PR should wire build_loss_matrix() -# to read from the database so this dict can be deleted. See PR #488. +# These values are specific to 2024 — they should NOT be applied to +# other years without re-sourcing. They are duplicated in +# db/etl_national_targets.py which loads them into policy_data.db. +# A future PR should wire build_loss_matrix() to read from the +# database so this dict can be deleted. See PR #488. HARD_CODED_TOTALS = { "health_insurance_premiums_without_medicare_part_b": 385e9, From c358db5e97ae921f1fc9087297244848d51e9cbf Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Mon, 9 Feb 2026 12:24:26 -0500 Subject: [PATCH 2/2] Add helpful error when local dataset file is missing On a fresh checkout without `make data`, the local DEFAULT_DATASET won't exist. Give a clear FileNotFoundError suggesting `make data` or `--dataset hf://...` instead of a cryptic load failure. Co-Authored-By: Claude Opus 4.6 --- policyengine_us_data/utils/db.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/policyengine_us_data/utils/db.py b/policyengine_us_data/utils/db.py index 378e230ea..71d2d4e1d 100644 --- a/policyengine_us_data/utils/db.py +++ b/policyengine_us_data/utils/db.py @@ -1,4 +1,5 @@ import argparse +from pathlib import Path from typing import Dict, List, Optional, Tuple from sqlmodel import Session, select @@ -43,6 +44,16 @@ def etl_argparser( args = parser.parse_args() + if ( + not args.dataset.startswith("hf://") + and not Path(args.dataset).exists() + ): + raise FileNotFoundError( + f"Dataset not found: {args.dataset}\n" + f"Either build it locally (`make data`) or pass a " + f"HuggingFace URL via --dataset hf://policyengine/..." + ) + from policyengine_us import Microsimulation print(f"Loading dataset: {args.dataset}")
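
After both patches, every ETL entry point shares one calling convention. A
minimal sketch for reference (illustrative only: the bare and extended forms
below mirror etl_snap.py and etl_irs_soi.py respectively, but "<domain>" and
the function names here are placeholders, not files in this patch):

    from policyengine_us_data.utils.db import etl_argparser

    def main():
        # Bare form: the year is derived from the dataset's
        # default_calculation_period, and a missing local file now
        # raises FileNotFoundError with a `make data` hint.
        _, year = etl_argparser("ETL for <domain> calibration targets")

    def main_with_extra_flag():
        # Extended form: extra_args_fn adds script-specific flags
        # before parsing (condensed from etl_irs_soi.py; the real
        # script's --lag help text is omitted here).
        def add_lag_arg(parser):
            parser.add_argument("--lag", type=int, default=2)

        args, dataset_year = etl_argparser(
            "ETL for IRS SOI calibration targets",
            extra_args_fn=add_lag_arg,
        )
        year = dataset_year - args.lag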