From 8fbe2be4b2392576373f4b7264f7583d38992b33 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Mon, 9 Feb 2026 10:52:24 -0500 Subject: [PATCH 1/2] Address PR #505 review feedback - Extract shared etl_argparser() into utils/db.py to eliminate repeated boilerplate across 7 ETL scripts - Label hardcoded dollar targets with HARDCODED_YEAR = 2024 instead of dynamic time_period; add warnings.warn when dataset year differs - Delete dead get_pseudo_input_variables() and update callers - Switch DEFAULT_DATASET to local storage path for local-first workflow - Add promote-dataset Makefile target and HF_CLONE_DIR variable - Add SOI Congress-session constants with RuntimeError guard for future tax-year bumps - Update Makefile comments for stratified CPS parameters Co-Authored-By: Claude Opus 4.6 --- Makefile | 19 +++- changelog_entry.yaml | 10 ++ .../calibration_utils.py | 24 ----- .../create_stratified_cps.py | 13 --- .../stacked_dataset_builder.py | 13 +-- .../db/create_initial_strata.py | 26 +----- policyengine_us_data/db/etl_age.py | 33 ++----- policyengine_us_data/db/etl_irs_soi.py | 73 ++++++--------- policyengine_us_data/db/etl_medicaid.py | 32 ++----- .../db/etl_national_targets.py | 91 +++++++++---------- policyengine_us_data/db/etl_snap.py | 32 ++----- .../db/etl_state_income_tax.py | 29 +----- .../calibration_targets/pull_soi_targets.py | 13 ++- policyengine_us_data/utils/db.py | 46 +++++++++- policyengine_us_data/utils/loss.py | 8 +- 15 files changed, 183 insertions(+), 279 deletions(-) diff --git a/Makefile b/Makefile index 09d984a96..bfa9bfad2 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,6 @@ -.PHONY: all format test install download upload docker documentation data publish-local-area clean build paper clean-paper presentations database database-refresh promote-database +.PHONY: all format test install download upload docker documentation data publish-local-area clean build paper clean-paper presentations database database-refresh promote-database promote-dataset + +HF_CLONE_DIR = $(HOME)/devl/huggingface/policyengine-us-data all: data test @@ -72,12 +74,17 @@ database-refresh: promote-database: cp policyengine_us_data/storage/calibration/policy_data.db \ - $(HOME)/devl/huggingface/policyengine-us-data/calibration/policy_data.db - rm -rf $(HOME)/devl/huggingface/policyengine-us-data/calibration/raw_inputs + $(HF_CLONE_DIR)/calibration/policy_data.db + rm -rf $(HF_CLONE_DIR)/calibration/raw_inputs cp -r policyengine_us_data/storage/calibration/raw_inputs \ - $(HOME)/devl/huggingface/policyengine-us-data/calibration/raw_inputs + $(HF_CLONE_DIR)/calibration/raw_inputs @echo "Copied DB and raw_inputs to HF clone. Now cd to HF repo, commit, and push." +promote-dataset: + cp policyengine_us_data/storage/stratified_extended_cps_2024.h5 \ + $(HF_CLONE_DIR)/calibration/stratified_extended_cps.h5 + @echo "Copied dataset to HF clone. Now cd to HF repo, commit, and push." + data: download python policyengine_us_data/utils/uprating.py python policyengine_us_data/datasets/acs/acs.py @@ -87,6 +94,10 @@ data: download python policyengine_us_data/datasets/cps/extended_cps.py python policyengine_us_data/datasets/cps/enhanced_cps.py python policyengine_us_data/datasets/cps/small_enhanced_cps.py + # 12000: number of households our GPUs can handle (found via trial and error). + # --top=99.5: include only top 0.5% (vs default 1%) to preserve + # representation of lower-income households. + # --seed=3526: reproducible stratified sampling. 
python policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py 12000 --top=99.5 --seed=3526 publish-local-area: diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29bb..a585d4b8a 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,10 @@ +- bump: patch + changes: + changed: + - Switch DEFAULT_DATASET to local storage path for database ETL scripts + - Extract shared etl_argparser() to reduce boilerplate across 7 ETL scripts + - Delete dead get_pseudo_input_variables() function + added: + - promote-dataset Makefile target + - Year-mismatch warning in national targets ETL + - Congress-session constants and warning in SOI district puller diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py b/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py index c27cf3e04..6a5c415ec 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py @@ -248,30 +248,6 @@ def get_calculated_variables(sim) -> List[str]: return result -def get_pseudo_input_variables(sim) -> set: - """ - Identify pseudo-input variables that should NOT be saved to H5 files. - - NOTE: This function currently returns an empty set. The original logic - excluded variables with 'adds' or 'subtracts' attributes, but analysis - showed that in CPS data, these variables contain authoritative stored - data that does NOT match their component variables: - - - pre_tax_contributions: components are all 0, aggregate has imputed values - - tax_exempt_pension_income: aggregate has 135M, components only 20M - - taxable_pension_income: aggregate has 82M, components only 29M - - interest_deduction: aggregate has 41M, components are 0 - - The 'adds' attribute defines how to CALCULATE these values, but in CPS - data the stored values are the authoritative source. Excluding them and - recalculating from components produces incorrect results. - - For geo-stacking, entity ID reindexing preserves within-entity - relationships, so aggregation within a person or tax_unit remains valid. - """ - return set() - - def apply_op(values: np.ndarray, op: str, val: str) -> np.ndarray: """Apply constraint operation to values array.""" try: diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py b/policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py index ba1011016..39b0b7771 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py @@ -13,9 +13,6 @@ from policyengine_us import Microsimulation from policyengine_core.data.dataset import Dataset from policyengine_core.enums import Enum -from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import ( - get_pseudo_input_variables, -) def create_stratified_cps_dataset( @@ -225,16 +222,6 @@ def create_stratified_cps_dataset( # Only save input variables (not calculated/derived variables) input_vars = set(sim.input_variables) - - # Filter out pseudo-inputs: variables with adds/subtracts that aggregate - # formula-based components. These have stale values that corrupt calculations. 
- pseudo_inputs = get_pseudo_input_variables(sim) - if pseudo_inputs: - print(f"Excluding {len(pseudo_inputs)} pseudo-input variables:") - for var in sorted(pseudo_inputs): - print(f" - {var}") - input_vars = input_vars - pseudo_inputs - print(f"Found {len(input_vars)} input variables to save") for variable in stratified_sim.tax_benefit_system.variables: diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py b/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py index 209218cf6..010e151f3 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py @@ -14,7 +14,6 @@ from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import ( get_all_cds_from_database, get_calculated_variables, - get_pseudo_input_variables, STATE_CODES, STATE_FIPS_TO_NAME, STATE_FIPS_TO_CODE, @@ -624,17 +623,7 @@ def create_sparse_cd_stacked_dataset( # Only save input variables (not calculated/derived variables) # Calculated variables like state_name, state_code will be recalculated on load - input_vars = set(base_sim.input_variables) - - # Filter out pseudo-inputs: variables with adds/subtracts that aggregate - # formula-based components. These have stale values that corrupt calculations. - pseudo_inputs = get_pseudo_input_variables(base_sim) - if pseudo_inputs: - print(f"Excluding {len(pseudo_inputs)} pseudo-input variables:") - for var in sorted(pseudo_inputs): - print(f" - {var}") - - vars_to_save = input_vars - pseudo_inputs + vars_to_save = set(base_sim.input_variables) print(f"Found {len(vars_to_save)} input variables to save") # congressional_district_geoid isn't in the original microdata and has no formula, diff --git a/policyengine_us_data/db/create_initial_strata.py b/policyengine_us_data/db/create_initial_strata.py index 8dda76e29..bb17e7472 100644 --- a/policyengine_us_data/db/create_initial_strata.py +++ b/policyengine_us_data/db/create_initial_strata.py @@ -1,4 +1,3 @@ -import argparse import logging from typing import Dict @@ -7,12 +6,11 @@ from sqlmodel import Session, create_engine from policyengine_us_data.storage import STORAGE_FOLDER - -DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, ) +from policyengine_us_data.utils.db import etl_argparser from policyengine_us_data.utils.raw_cache import ( is_cached, save_json, @@ -71,27 +69,7 @@ def fetch_congressional_districts(year): def main(): - parser = argparse.ArgumentParser( - description="Create initial geographic strata for calibration" - ) - parser.add_argument( - "--dataset", - default=DEFAULT_DATASET, - help=( - "Source dataset (local path or HuggingFace URL). " - "The year for Census API calls is derived from the dataset's " - "default_calculation_period. 
Default: %(default)s" - ), - ) - args = parser.parse_args() - - # Derive year from dataset - from policyengine_us import Microsimulation - - print(f"Loading dataset: {args.dataset}") - sim = Microsimulation(dataset=args.dataset) - year = int(sim.default_calculation_period) - print(f"Derived year from dataset: {year}") + _, year = etl_argparser("Create initial geographic strata for calibration") # State FIPS to name/abbreviation mapping STATE_NAMES = { diff --git a/policyengine_us_data/db/etl_age.py b/policyengine_us_data/db/etl_age.py index 2e213d92b..74d5ec003 100644 --- a/policyengine_us_data/db/etl_age.py +++ b/policyengine_us_data/db/etl_age.py @@ -1,13 +1,8 @@ -import argparse - import pandas as pd import numpy as np from sqlmodel import Session, create_engine, select from policyengine_us_data.storage import STORAGE_FOLDER - -DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" - from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -15,7 +10,11 @@ SourceType, ) from policyengine_us_data.utils.census import get_census_docs, pull_acs_table -from policyengine_us_data.utils.db import parse_ucgid, get_geographic_strata +from policyengine_us_data.utils.db import ( + parse_ucgid, + get_geographic_strata, + etl_argparser, +) from policyengine_us_data.utils.db_metadata import ( get_or_create_source, get_or_create_variable_group, @@ -287,27 +286,7 @@ def load_age_data(df_long, geo, year): def main(): - parser = argparse.ArgumentParser( - description="ETL for age calibration targets" - ) - parser.add_argument( - "--dataset", - default=DEFAULT_DATASET, - help=( - "Source dataset (local path or HuggingFace URL). " - "The year for Census API calls is derived from the dataset's " - "default_calculation_period. 
Default: %(default)s" - ), - ) - args = parser.parse_args() - - # Derive year from dataset - from policyengine_us import Microsimulation - - print(f"Loading dataset: {args.dataset}") - sim = Microsimulation(dataset=args.dataset) - year = int(sim.default_calculation_period) - print(f"Derived year from dataset: {year}") + _, year = etl_argparser("ETL for age calibration targets") # --- ETL: Extract, Transform, Load ---- diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py index 5f191ce3f..490c99a01 100644 --- a/policyengine_us_data/db/etl_irs_soi.py +++ b/policyengine_us_data/db/etl_irs_soi.py @@ -1,4 +1,3 @@ -import argparse import logging from typing import Optional @@ -8,19 +7,6 @@ from sqlmodel import Session, create_engine, select from policyengine_us_data.storage import STORAGE_FOLDER - -DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" - -# IRS SOI data is typically available ~2 years after the tax year -IRS_SOI_LAG_YEARS = 2 -from policyengine_us_data.utils.raw_cache import ( - is_cached, - cache_path, - save_bytes, -) - -logger = logging.getLogger(__name__) - from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -34,6 +20,7 @@ get_stratum_parent, parse_ucgid, get_geographic_strata, + etl_argparser, ) from policyengine_us_data.utils.db_metadata import ( get_or_create_source, @@ -44,6 +31,17 @@ from policyengine_us_data.storage.calibration_targets.make_district_mapping import ( get_district_mapping, ) +from policyengine_us_data.utils.raw_cache import ( + is_cached, + cache_path, + save_bytes, +) + +logger = logging.getLogger(__name__) + + +# IRS SOI data is typically available ~2 years after the tax year +IRS_SOI_LAG_YEARS = 2 """See the 22incddocguide.docx manual from the IRS SOI""" # Language in the doc: '$10,000 under $25,000' means >= $10,000 and < $25,000 @@ -1236,40 +1234,23 @@ def load_soi_data(long_dfs, year): def main(): - parser = argparse.ArgumentParser( - description="ETL for IRS SOI calibration targets" - ) - parser.add_argument( - "--dataset", - default=DEFAULT_DATASET, - help=( - "Source dataset (local path or HuggingFace URL). " - "The year for IRS SOI data is derived from the dataset's " - "default_calculation_period minus IRS_SOI_LAG_YEARS. 
" - "Default: %(default)s" - ), - ) - parser.add_argument( - "--lag", - type=int, - default=IRS_SOI_LAG_YEARS, - help=( - "Years to subtract from dataset year for IRS SOI data " - "(default: %(default)s, since IRS data is ~2 years behind)" - ), - ) - args = parser.parse_args() - - # Derive year from dataset with lag applied - from policyengine_us import Microsimulation + def add_lag_arg(parser): + parser.add_argument( + "--lag", + type=int, + default=IRS_SOI_LAG_YEARS, + help=( + "Years to subtract from dataset year for IRS SOI data " + "(default: %(default)s, since IRS data is ~2 years behind)" + ), + ) - print(f"Loading dataset: {args.dataset}") - sim = Microsimulation(dataset=args.dataset) - dataset_year = int(sim.default_calculation_period) - year = dataset_year - args.lag - print( - f"Dataset year: {dataset_year}, IRS SOI year: {year} (lag={args.lag})" + args, dataset_year = etl_argparser( + "ETL for IRS SOI calibration targets", + extra_args_fn=add_lag_arg, ) + year = dataset_year - args.lag + print(f"IRS SOI year: {year} (lag={args.lag})") # Extract ----------------------- raw_df = extract_soi_data() diff --git a/policyengine_us_data/db/etl_medicaid.py b/policyengine_us_data/db/etl_medicaid.py index 435ccd42c..7b34863e0 100644 --- a/policyengine_us_data/db/etl_medicaid.py +++ b/policyengine_us_data/db/etl_medicaid.py @@ -1,4 +1,3 @@ -import argparse import logging import requests @@ -7,9 +6,6 @@ from sqlmodel import Session, create_engine, select from policyengine_us_data.storage import STORAGE_FOLDER - -DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" - from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -20,7 +16,11 @@ STATE_ABBREV_TO_FIPS, pull_acs_table, ) -from policyengine_us_data.utils.db import parse_ucgid, get_geographic_strata +from policyengine_us_data.utils.db import ( + parse_ucgid, + get_geographic_strata, + etl_argparser, +) from policyengine_us_data.utils.db_metadata import ( get_or_create_source, get_or_create_variable_group, @@ -328,27 +328,7 @@ def load_medicaid_data(long_state, long_cd, year): def main(): - parser = argparse.ArgumentParser( - description="ETL for Medicaid calibration targets" - ) - parser.add_argument( - "--dataset", - default=DEFAULT_DATASET, - help=( - "Source dataset (local path or HuggingFace URL). " - "The year for targets is derived from the dataset's " - "default_calculation_period. 
Default: %(default)s" - ), - ) - args = parser.parse_args() - - # Derive year from dataset - from policyengine_us import Microsimulation - - print(f"Loading dataset: {args.dataset}") - sim = Microsimulation(dataset=args.dataset) - year = int(sim.default_calculation_period) - print(f"Derived year from dataset: {year}") + _, year = etl_argparser("ETL for Medicaid calibration targets") # Extract ------------------------------ state_admin_df = extract_administrative_medicaid_data(year) diff --git a/policyengine_us_data/db/etl_national_targets.py b/policyengine_us_data/db/etl_national_targets.py index fd97b83f4..7688b5705 100644 --- a/policyengine_us_data/db/etl_national_targets.py +++ b/policyengine_us_data/db/etl_national_targets.py @@ -1,4 +1,4 @@ -import argparse +import warnings from sqlmodel import Session, create_engine import pandas as pd @@ -10,12 +10,14 @@ Target, SourceType, ) +from policyengine_us_data.utils.db import ( + DEFAULT_DATASET, + etl_argparser, +) from policyengine_us_data.utils.db_metadata import ( get_or_create_source, ) -DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" - def extract_national_targets(dataset: str = DEFAULT_DATASET): """ @@ -46,7 +48,16 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET): time_period = int(sim.default_calculation_period) print(f"Derived time_period from dataset: {time_period}") - # Direct sum targets - use the time_period derived from the dataset + # Hardcoded dollar targets are specific to 2024 and should be + # labeled as such. Only CBO/Treasury parameter lookups use the + # dynamic time_period derived from the dataset. + HARDCODED_YEAR = 2024 + if time_period != HARDCODED_YEAR: + warnings.warn( + f"Dataset year ({time_period}) != HARDCODED_YEAR " + f"({HARDCODED_YEAR}). Hardcoded dollar targets may " + f"be stale and need re-sourcing." 
+ ) # Separate tax-related targets that need filer constraint tax_filer_targets = [ @@ -55,35 +66,35 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET): "value": 21.247e9, "source": "Joint Committee on Taxation", "notes": "SALT deduction tax expenditure", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "medical_expense_deduction", "value": 11.4e9, "source": "Joint Committee on Taxation", "notes": "Medical expense deduction tax expenditure", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "charitable_deduction", "value": 65.301e9, "source": "Joint Committee on Taxation", "notes": "Charitable deduction tax expenditure", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "interest_deduction", "value": 24.8e9, "source": "Joint Committee on Taxation", "notes": "Mortgage interest deduction tax expenditure", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "qualified_business_income_deduction", "value": 63.1e9, "source": "Joint Committee on Taxation", "notes": "QBI deduction tax expenditure", - "year": time_period, + "year": HARDCODED_YEAR, }, ] @@ -93,112 +104,112 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET): "value": 13e9, "source": "Survey-reported (post-TCJA grandfathered)", "notes": "Alimony received - survey reported, not tax-filer restricted", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "alimony_expense", "value": 13e9, "source": "Survey-reported (post-TCJA grandfathered)", "notes": "Alimony paid - survey reported, not tax-filer restricted", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "medicaid", "value": 871.7e9, "source": "https://www.cms.gov/files/document/highlights.pdf", "notes": "CMS 2023 highlights document - total Medicaid spending", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "net_worth", "value": 160e12, "source": "Federal Reserve SCF", "notes": "Total household net worth", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "health_insurance_premiums_without_medicare_part_b", "value": 385e9, "source": "MEPS/NHEA", "notes": "Health insurance premiums excluding Medicare Part B", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "other_medical_expenses", "value": 278e9, "source": "MEPS/NHEA", "notes": "Out-of-pocket medical expenses", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "medicare_part_b_premiums", "value": 112e9, "source": "CMS Medicare data", "notes": "Medicare Part B premium payments", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "over_the_counter_health_expenses", "value": 72e9, "source": "Consumer Expenditure Survey", "notes": "OTC health products and supplies", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "child_support_expense", "value": 33e9, "source": "Census Bureau", "notes": "Child support payments", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "child_support_received", "value": 33e9, "source": "Census Bureau", "notes": "Child support received", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "spm_unit_capped_work_childcare_expenses", "value": 348e9, "source": "Census Bureau SPM", "notes": "Work and childcare expenses for SPM", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "spm_unit_capped_housing_subsidy", "value": 35e9, "source": "HUD/Census", "notes": "Housing subsidies", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "tanf", 
"value": 9e9, "source": "HHS/ACF", "notes": "TANF cash assistance", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "real_estate_taxes", "value": 500e9, "source": "Census Bureau", "notes": "Property taxes paid", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "rent", "value": 735e9, "source": "Census Bureau/BLS", "notes": "Rental payments", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "tip_income", "value": 53.2e9, "source": "IRS Form W-2 Box 7 statistics", "notes": "Social security tips uprated 40% to account for underreporting", - "year": time_period, + "year": HARDCODED_YEAR, }, # SSA benefit-type totals derived from trust fund data and # SSA fact sheet type shares @@ -207,28 +218,28 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET): "value": 1_060e9, "source": "https://www.ssa.gov/OACT/STATS/table4a3.html", "notes": "~73% of total OASDI ($1,452B CBO projection)", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "social_security_disability", "value": 148e9, "source": "https://www.ssa.gov/OACT/STATS/table4a3.html", "notes": "~10.2% of total OASDI (disabled workers)", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "social_security_survivors", "value": 160e9, "source": "https://www.ssa.gov/OACT/FACTS/", "notes": "~11.0% of total OASDI (widows, children of deceased)", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "social_security_dependents", "value": 84e9, "source": "https://www.ssa.gov/OACT/FACTS/", "notes": "~5.8% of total OASDI (spouses/children of retired+disabled)", - "year": time_period, + "year": HARDCODED_YEAR, }, # IRA contribution totals from IRS SOI accumulation tables { @@ -236,14 +247,14 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET): "value": 25e9, "source": "https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements", "notes": "Tax year 2022 (~5M x $4,510 avg) uprated ~12% to 2024", - "year": time_period, + "year": HARDCODED_YEAR, }, { "variable": "roth_ira_contributions", "value": 39e9, "source": "https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements", "notes": "Tax year 2022 (~10M x $3,482 avg) uprated ~12% to 2024", - "year": time_period, + "year": HARDCODED_YEAR, }, ] @@ -256,7 +267,7 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET): "person_count": 72_429_055, "source": "CMS/HHS administrative data", "notes": "Medicaid enrollment count", - "year": time_period, + "year": HARDCODED_YEAR, }, { "constraint_variable": "aca_ptc", @@ -264,7 +275,7 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET): "person_count": 19_743_689, "source": "CMS marketplace data", "notes": "ACA Premium Tax Credit recipients", - "year": time_period, + "year": HARDCODED_YEAR, }, ] @@ -715,19 +726,7 @@ def load_national_targets( def main(): """Main ETL pipeline for national targets.""" - parser = argparse.ArgumentParser( - description="ETL for national calibration targets" - ) - parser.add_argument( - "--dataset", - default=DEFAULT_DATASET, - help=( - "Source dataset (local path or HuggingFace URL). " - "The time_period for targets is derived from the dataset's " - "default_calculation_period. 
Default: %(default)s" - ), - ) - args = parser.parse_args() + args, _ = etl_argparser("ETL for national calibration targets") # Extract print("Extracting national targets...") diff --git a/policyengine_us_data/db/etl_snap.py b/policyengine_us_data/db/etl_snap.py index 554f67ec1..ffa5ee6d2 100644 --- a/policyengine_us_data/db/etl_snap.py +++ b/policyengine_us_data/db/etl_snap.py @@ -1,4 +1,3 @@ -import argparse import logging import requests import zipfile @@ -10,9 +9,6 @@ from sqlmodel import Session, create_engine, select from policyengine_us_data.storage import STORAGE_FOLDER - -DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" - from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -26,7 +22,11 @@ pull_acs_table, STATE_NAME_TO_FIPS, ) -from policyengine_us_data.utils.db import parse_ucgid, get_geographic_strata +from policyengine_us_data.utils.db import ( + parse_ucgid, + get_geographic_strata, + etl_argparser, +) from policyengine_us_data.utils.db_metadata import ( get_or_create_source, get_or_create_variable_group, @@ -367,27 +367,7 @@ def load_survey_snap_data(survey_df, year, snap_stratum_lookup): def main(): - parser = argparse.ArgumentParser( - description="ETL for SNAP calibration targets" - ) - parser.add_argument( - "--dataset", - default=DEFAULT_DATASET, - help=( - "Source dataset (local path or HuggingFace URL). " - "The year for targets is derived from the dataset's " - "default_calculation_period. Default: %(default)s" - ), - ) - args = parser.parse_args() - - # Derive year from dataset - from policyengine_us import Microsimulation - - print(f"Loading dataset: {args.dataset}") - sim = Microsimulation(dataset=args.dataset) - year = int(sim.default_calculation_period) - print(f"Derived year from dataset: {year}") + _, year = etl_argparser("ETL for SNAP calibration targets") # Extract --------- zip_file_admin = extract_administrative_snap_data() diff --git a/policyengine_us_data/db/etl_state_income_tax.py b/policyengine_us_data/db/etl_state_income_tax.py index 9da8d8390..89d941438 100644 --- a/policyengine_us_data/db/etl_state_income_tax.py +++ b/policyengine_us_data/db/etl_state_income_tax.py @@ -10,15 +10,12 @@ Stratum Group ID: 7 (State Income Tax) """ -import argparse import logging import pandas as pd import numpy as np from sqlmodel import Session, create_engine, select from policyengine_us_data.storage import STORAGE_FOLDER - -DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -28,7 +25,7 @@ VariableGroup, VariableMetadata, ) -from policyengine_us_data.utils.db import get_geographic_strata +from policyengine_us_data.utils.db import get_geographic_strata, etl_argparser from policyengine_us_data.utils.db_metadata import ( get_or_create_source, get_or_create_variable_group, @@ -42,6 +39,7 @@ logger = logging.getLogger(__name__) + # Stratum group ID for state income tax targets STRATUM_GROUP_ID_STATE_INCOME_TAX = 7 @@ -345,32 +343,11 @@ def load_state_income_tax_data(df: pd.DataFrame, year: int) -> dict: def main(): """Run the full ETL pipeline for state income tax targets.""" - parser = argparse.ArgumentParser( - description="ETL for state income tax calibration targets" - ) - parser.add_argument( - "--dataset", - default=DEFAULT_DATASET, - help=( - "Source dataset (local path or HuggingFace URL). 
" - "The year for targets is derived from the dataset's " - "default_calculation_period. Default: %(default)s" - ), - ) - args = parser.parse_args() - logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", ) - - # Derive year from dataset - from policyengine_us import Microsimulation - - logger.info(f"Loading dataset: {args.dataset}") - sim = Microsimulation(dataset=args.dataset) - year = int(sim.default_calculation_period) - logger.info(f"Derived year from dataset: {year}") + _, year = etl_argparser("ETL for state income tax calibration targets") logger.info(f"Extracting Census STC data for FY{year}...") raw_df = extract_state_income_tax_data(year) diff --git a/policyengine_us_data/storage/calibration_targets/pull_soi_targets.py b/policyengine_us_data/storage/calibration_targets/pull_soi_targets.py index c3f159191..59050a1b3 100644 --- a/policyengine_us_data/storage/calibration_targets/pull_soi_targets.py +++ b/policyengine_us_data/storage/calibration_targets/pull_soi_targets.py @@ -38,6 +38,9 @@ "$500,000 or more": (500_000, np.inf), } +SOI_CONGRESS_PREFIX = "5001800US" # 118th Congress +SOI_DISTRICT_TAX_YEAR = 2022 + NON_VOTING_STATES = {"US", "AS", "GU", "MP", "PR", "VI", "OA"} NON_VOTING_GEO_IDS = { "0400000US72", # Puerto Rico (state level) @@ -249,7 +252,15 @@ def pull_district_soi_variable( df["CONG_DISTRICT"] = ( df["CONG_DISTRICT"].astype(int).astype(str).str.zfill(2) ) - df["GEO_ID"] = "5001800US" + df["STATEFIPS"] + df["CONG_DISTRICT"] + if SOI_DISTRICT_TAX_YEAR >= 2024: + raise RuntimeError( + f"SOI tax year {SOI_DISTRICT_TAX_YEAR} may need " + f"119th Congress districts (5001900US). Update " + f"SOI_CONGRESS_PREFIX and remove this check " + f"once verified." + ) + + df["GEO_ID"] = SOI_CONGRESS_PREFIX + df["STATEFIPS"] + df["CONG_DISTRICT"] df = df[~df["GEO_ID"].isin(NON_VOTING_GEO_IDS)] at_large_states = ( diff --git a/policyengine_us_data/utils/db.py b/policyengine_us_data/utils/db.py index 4de79c44f..378e230ea 100644 --- a/policyengine_us_data/utils/db.py +++ b/policyengine_us_data/utils/db.py @@ -1,4 +1,5 @@ -from typing import Dict, List, Optional +import argparse +from typing import Dict, List, Optional, Tuple from sqlmodel import Session, select import sqlalchemy as sa @@ -7,6 +8,49 @@ Stratum, StratumConstraint, ) +from policyengine_us_data.storage import STORAGE_FOLDER + +DEFAULT_DATASET = str(STORAGE_FOLDER / "stratified_extended_cps_2024.h5") + + +def etl_argparser( + description: str, + extra_args_fn=None, +) -> Tuple[argparse.Namespace, int]: + """Shared argument parsing and dataset-year derivation for ETL scripts. + + Args: + description: Description for the argparse help text. + extra_args_fn: Optional callable that receives the parser to add + extra arguments before parsing. + + Returns: + (args, year) where *year* is derived from the dataset's + ``default_calculation_period``. + """ + parser = argparse.ArgumentParser(description=description) + parser.add_argument( + "--dataset", + default=DEFAULT_DATASET, + help=( + "Source dataset (local path or HuggingFace URL). " + "The year is derived from the dataset's " + "default_calculation_period. 
Default: %(default)s" + ), + ) + if extra_args_fn is not None: + extra_args_fn(parser) + + args = parser.parse_args() + + from policyengine_us import Microsimulation + + print(f"Loading dataset: {args.dataset}") + sim = Microsimulation(dataset=args.dataset) + year = int(sim.default_calculation_period) + print(f"Derived year from dataset: {year}") + + return args, year def get_stratum_by_id(session: Session, stratum_id: int) -> Optional[Stratum]: diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index bb3f8cb88..134e919b3 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -10,9 +10,11 @@ from policyengine_us_data.utils.soi import pe_to_soi, get_soi # National calibration targets consumed by build_loss_matrix(). -# These are duplicated in db/etl_national_targets.py which loads them -# into policy_data.db. A future PR should wire build_loss_matrix() -# to read from the database so this dict can be deleted. See PR #488. +# These values are specific to 2024 — they should NOT be applied to +# other years without re-sourcing. They are duplicated in +# db/etl_national_targets.py which loads them into policy_data.db. +# A future PR should wire build_loss_matrix() to read from the +# database so this dict can be deleted. See PR #488. HARD_CODED_TOTALS = { "health_insurance_premiums_without_medicare_part_b": 385e9, From c358db5e97ae921f1fc9087297244848d51e9cbf Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Mon, 9 Feb 2026 12:24:26 -0500 Subject: [PATCH 2/2] Add helpful error when local dataset file is missing On a fresh checkout without `make data`, the local DEFAULT_DATASET won't exist. Give a clear FileNotFoundError suggesting `make data` or `--dataset hf://...` instead of a cryptic load failure. Co-Authored-By: Claude Opus 4.6 --- policyengine_us_data/utils/db.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/policyengine_us_data/utils/db.py b/policyengine_us_data/utils/db.py index 378e230ea..71d2d4e1d 100644 --- a/policyengine_us_data/utils/db.py +++ b/policyengine_us_data/utils/db.py @@ -1,4 +1,5 @@ import argparse +from pathlib import Path from typing import Dict, List, Optional, Tuple from sqlmodel import Session, select @@ -43,6 +44,16 @@ def etl_argparser( args = parser.parse_args() + if ( + not args.dataset.startswith("hf://") + and not Path(args.dataset).exists() + ): + raise FileNotFoundError( + f"Dataset not found: {args.dataset}\n" + f"Either build it locally (`make data`) or pass a " + f"HuggingFace URL via --dataset hf://policyengine/..." + ) + from policyengine_us import Microsimulation print(f"Loading dataset: {args.dataset}")
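
After both patches, every ETL entry point shares one calling convention. A
minimal sketch for reference (illustrative only: the bare and extended forms
below mirror etl_snap.py and etl_irs_soi.py respectively, but "<domain>" and
the function names here are placeholders, not files in this patch):

    from policyengine_us_data.utils.db import etl_argparser

    def main():
        # Bare form: the year is derived from the dataset's
        # default_calculation_period, and a missing local file now
        # raises FileNotFoundError with a `make data` hint.
        _, year = etl_argparser("ETL for <domain> calibration targets")

    def main_with_extra_flag():
        # Extended form: extra_args_fn adds script-specific flags
        # before parsing (condensed from etl_irs_soi.py; the real
        # script's --lag help text is omitted here).
        def add_lag_arg(parser):
            parser.add_argument("--lag", type=int, default=2)

        args, dataset_year = etl_argparser(
            "ETL for IRS SOI calibration targets",
            extra_args_fn=add_lag_arg,
        )
        year = dataset_year - args.lag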