From 5e35d4dcc4d4a0d4a96ca66b975b20bf29ca8108 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Mon, 2 Feb 2026 10:36:17 -0500 Subject: [PATCH 1/8] Fix stale calibration targets by deriving time_period from dataset - Remove hardcoded CBO_YEAR and TREASURY_YEAR constants - Add --dataset CLI argument to etl_national_targets.py - Derive time_period from sim.default_calculation_period - Default to HuggingFace production dataset The dataset itself is now the single source of truth for the calibration year, preventing future drift when updating to new base years. Closes #503 Co-Authored-By: Claude Opus 4.5 --- changelog_entry.yaml | 3 + .../db/etl_national_targets.py | 117 +++++++++++------- 2 files changed, 73 insertions(+), 47 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29bb..a2210db7e 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,3 @@ +- date: 2026-02-02 + type: fixed + description: Fix stale 2022-2023 calibration targets in policy_data.db by deriving time_period from the dataset instead of hardcoding year constants diff --git a/policyengine_us_data/db/etl_national_targets.py b/policyengine_us_data/db/etl_national_targets.py index 7e02d6f09..fd97b83f4 100644 --- a/policyengine_us_data/db/etl_national_targets.py +++ b/policyengine_us_data/db/etl_national_targets.py @@ -1,3 +1,5 @@ +import argparse + from sqlmodel import Session, create_engine import pandas as pd @@ -12,11 +14,19 @@ get_or_create_source, ) +DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" + -def extract_national_targets(): +def extract_national_targets(dataset: str = DEFAULT_DATASET): """ Extract national calibration targets from various sources. + Parameters + ---------- + dataset : str + Path to the calibration dataset (local path or HuggingFace URL). + The time period is derived from the dataset's default_calculation_period. 
+ Returns ------- dict @@ -26,18 +36,17 @@ def extract_national_targets(): - conditional_count_targets: Enrollment counts requiring constraints - cbo_targets: List of CBO projection targets - treasury_targets: List of Treasury/JCT targets + - time_period: The year derived from the dataset """ - - # Initialize PolicyEngine for parameter access from policyengine_us import Microsimulation - sim = Microsimulation( - dataset="hf://policyengine/policyengine-us-data/cps_2023.h5" - ) + print(f"Loading dataset: {dataset}") + sim = Microsimulation(dataset=dataset) + + time_period = int(sim.default_calculation_period) + print(f"Derived time_period from dataset: {time_period}") - # Direct sum targets - these are regular variables that can be summed - # Store with their actual source year (2024 for hardcoded values from loss.py) - HARDCODED_YEAR = 2024 + # Direct sum targets - use the time_period derived from the dataset # Separate tax-related targets that need filer constraint tax_filer_targets = [ @@ -46,35 +55,35 @@ def extract_national_targets(): "value": 21.247e9, "source": "Joint Committee on Taxation", "notes": "SALT deduction tax expenditure", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "medical_expense_deduction", "value": 11.4e9, "source": "Joint Committee on Taxation", "notes": "Medical expense deduction tax expenditure", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "charitable_deduction", "value": 65.301e9, "source": "Joint Committee on Taxation", "notes": "Charitable deduction tax expenditure", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "interest_deduction", "value": 24.8e9, "source": "Joint Committee on Taxation", "notes": "Mortgage interest deduction tax expenditure", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "qualified_business_income_deduction", "value": 63.1e9, "source": "Joint Committee on Taxation", "notes": "QBI deduction tax expenditure", - "year": HARDCODED_YEAR, + "year": time_period, }, ] @@ -84,112 +93,112 @@ def extract_national_targets(): "value": 13e9, "source": "Survey-reported (post-TCJA grandfathered)", "notes": "Alimony received - survey reported, not tax-filer restricted", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "alimony_expense", "value": 13e9, "source": "Survey-reported (post-TCJA grandfathered)", "notes": "Alimony paid - survey reported, not tax-filer restricted", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "medicaid", "value": 871.7e9, "source": "https://www.cms.gov/files/document/highlights.pdf", "notes": "CMS 2023 highlights document - total Medicaid spending", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "net_worth", "value": 160e12, "source": "Federal Reserve SCF", "notes": "Total household net worth", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "health_insurance_premiums_without_medicare_part_b", "value": 385e9, "source": "MEPS/NHEA", "notes": "Health insurance premiums excluding Medicare Part B", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "other_medical_expenses", "value": 278e9, "source": "MEPS/NHEA", "notes": "Out-of-pocket medical expenses", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "medicare_part_b_premiums", "value": 112e9, "source": "CMS Medicare data", "notes": "Medicare Part B premium payments", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "over_the_counter_health_expenses", "value": 72e9, "source": 
"Consumer Expenditure Survey", "notes": "OTC health products and supplies", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "child_support_expense", "value": 33e9, "source": "Census Bureau", "notes": "Child support payments", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "child_support_received", "value": 33e9, "source": "Census Bureau", "notes": "Child support received", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "spm_unit_capped_work_childcare_expenses", "value": 348e9, "source": "Census Bureau SPM", "notes": "Work and childcare expenses for SPM", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "spm_unit_capped_housing_subsidy", "value": 35e9, "source": "HUD/Census", "notes": "Housing subsidies", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "tanf", "value": 9e9, "source": "HHS/ACF", "notes": "TANF cash assistance", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "real_estate_taxes", "value": 500e9, "source": "Census Bureau", "notes": "Property taxes paid", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "rent", "value": 735e9, "source": "Census Bureau/BLS", "notes": "Rental payments", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "tip_income", "value": 53.2e9, "source": "IRS Form W-2 Box 7 statistics", "notes": "Social security tips uprated 40% to account for underreporting", - "year": HARDCODED_YEAR, + "year": time_period, }, # SSA benefit-type totals derived from trust fund data and # SSA fact sheet type shares @@ -198,28 +207,28 @@ def extract_national_targets(): "value": 1_060e9, "source": "https://www.ssa.gov/OACT/STATS/table4a3.html", "notes": "~73% of total OASDI ($1,452B CBO projection)", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "social_security_disability", "value": 148e9, "source": "https://www.ssa.gov/OACT/STATS/table4a3.html", "notes": "~10.2% of total OASDI (disabled workers)", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "social_security_survivors", "value": 160e9, "source": "https://www.ssa.gov/OACT/FACTS/", "notes": "~11.0% of total OASDI (widows, children of deceased)", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "social_security_dependents", "value": 84e9, "source": "https://www.ssa.gov/OACT/FACTS/", "notes": "~5.8% of total OASDI (spouses/children of retired+disabled)", - "year": HARDCODED_YEAR, + "year": time_period, }, # IRA contribution totals from IRS SOI accumulation tables { @@ -227,14 +236,14 @@ def extract_national_targets(): "value": 25e9, "source": "https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements", "notes": "Tax year 2022 (~5M x $4,510 avg) uprated ~12% to 2024", - "year": HARDCODED_YEAR, + "year": time_period, }, { "variable": "roth_ira_contributions", "value": 39e9, "source": "https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements", "notes": "Tax year 2022 (~10M x $3,482 avg) uprated ~12% to 2024", - "year": HARDCODED_YEAR, + "year": time_period, }, ] @@ -247,7 +256,7 @@ def extract_national_targets(): "person_count": 72_429_055, "source": "CMS/HHS administrative data", "notes": "Medicaid enrollment count", - "year": HARDCODED_YEAR, + "year": time_period, }, { "constraint_variable": "aca_ptc", @@ -255,7 +264,7 @@ def extract_national_targets(): "person_count": 19_743_689, "source": "CMS marketplace data", "notes": 
"ACA Premium Tax Credit recipients", - "year": HARDCODED_YEAR, + "year": time_period, }, ] @@ -302,8 +311,7 @@ def extract_national_targets(): conditional_count_targets.extend(ssn_none_targets_by_year) - # CBO projection targets - get for a specific year - CBO_YEAR = 2023 # Year the CBO projections are for + # CBO projection targets - use time_period derived from dataset cbo_vars = [ # Note: income_tax_positive matches CBO's receipts definition # where refundable credit payments in excess of liability are @@ -326,7 +334,7 @@ def extract_national_targets(): param_name = cbo_param_name_map.get(variable_name, variable_name) try: value = sim.tax_benefit_system.parameters( - CBO_YEAR + time_period ).calibration.gov.cbo._children[param_name] cbo_targets.append( { @@ -334,7 +342,7 @@ def extract_national_targets(): "value": float(value), "source": "CBO Budget Projections", "notes": f"CBO projection for {variable_name}", - "year": CBO_YEAR, + "year": time_period, } ) except (KeyError, AttributeError) as e: @@ -343,11 +351,10 @@ def extract_national_targets(): f"{variable_name} (param: {param_name}): {e}" ) - # Treasury/JCT targets (EITC) - get for a specific year - TREASURY_YEAR = 2023 + # Treasury/JCT targets (EITC) - use time_period derived from dataset try: eitc_value = sim.tax_benefit_system.parameters.calibration.gov.treasury.tax_expenditures.eitc( - TREASURY_YEAR + time_period ) treasury_targets = [ { @@ -355,7 +362,7 @@ def extract_national_targets(): "value": float(eitc_value), "source": "Treasury/JCT Tax Expenditures", "notes": "EITC tax expenditure", - "year": TREASURY_YEAR, + "year": time_period, } ] except (KeyError, AttributeError) as e: @@ -368,6 +375,7 @@ def extract_national_targets(): "conditional_count_targets": conditional_count_targets, "cbo_targets": cbo_targets, "treasury_targets": treasury_targets, + "time_period": time_period, } @@ -707,10 +715,25 @@ def load_national_targets( def main(): """Main ETL pipeline for national targets.""" + parser = argparse.ArgumentParser( + description="ETL for national calibration targets" + ) + parser.add_argument( + "--dataset", + default=DEFAULT_DATASET, + help=( + "Source dataset (local path or HuggingFace URL). " + "The time_period for targets is derived from the dataset's " + "default_calculation_period. Default: %(default)s" + ), + ) + args = parser.parse_args() # Extract print("Extracting national targets...") - raw_targets = extract_national_targets() + raw_targets = extract_national_targets(dataset=args.dataset) + time_period = raw_targets["time_period"] + print(f"Using time_period={time_period} for CBO/Treasury targets") # Transform print("Transforming targets...") From 091f0e0a3811f6710f2e30cb49d0b1f2839aa410 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Mon, 2 Feb 2026 13:04:03 -0500 Subject: [PATCH 2/8] Use income_tax_positive for CBO calibration in loss.py The CBO income_tax parameter represents positive-only receipts (refundable credit payments in excess of liability are classified as outlays, not negative receipts). Using income_tax_positive matches this definition. 
Co-Authored-By: Claude Opus 4.5 --- changelog_entry.yaml | 2 +- policyengine_us_data/utils/loss.py | 17 +++++++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index a2210db7e..6ea6b894d 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,3 +1,3 @@ - date: 2026-02-02 type: fixed - description: Fix stale 2022-2023 calibration targets in policy_data.db by deriving time_period from the dataset instead of hardcoding year constants + description: Fix stale calibration targets by deriving time_period from dataset and using income_tax_positive for CBO calibration diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index f798c0dc6..e9916641a 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -220,26 +220,35 @@ def build_loss_matrix(dataset: type, time_period): targets_array.append(populations[year]) # CBO projections + # Note: income_tax_positive matches CBO's receipts definition where + # refundable credit payments in excess of liability are classified as + # outlays, not negative receipts. See: https://www.cbo.gov/publication/43767 - PROGRAMS = [ - "income_tax", + CBO_PROGRAMS = [ + "income_tax_positive", "snap", "social_security", "ssi", "unemployment_compensation", ] - for variable_name in PROGRAMS: + # Mapping from variable name to CBO parameter name (when different) + CBO_PARAM_NAME_MAP = { + "income_tax_positive": "income_tax", + } + + for variable_name in CBO_PROGRAMS: label = f"nation/cbo/{variable_name}" loss_matrix[label] = sim.calculate( variable_name, map_to="household" ).values if any(loss_matrix[label].isna()): raise ValueError(f"Missing values for {label}") + param_name = CBO_PARAM_NAME_MAP.get(variable_name, variable_name) targets_array.append( sim.tax_benefit_system.parameters( time_period - ).calibration.gov.cbo._children[variable_name] + ).calibration.gov.cbo._children[param_name] ) # 1. Medicaid Spending From 8e658b2e35924f3264e136ccd5527c16fadb9488 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Mon, 2 Feb 2026 13:28:51 -0500 Subject: [PATCH 3/8] Add --dataset argument to all database ETL scripts All ETL scripts now derive their target year from the dataset's default_calculation_period instead of hardcoding years. This ensures all calibration targets stay synchronized when updating to a new base year annually. 
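The scripts listed below all repeat the same boilerplate: add a --dataset argument, load a Microsimulation, and read default_calculation_period. As a sketch of that shared pattern, factored into a hypothetical derive_year_from_dataset helper that is not part of this patch (each script currently inlines the logic):

```python
# Shared year-derivation pattern used by the ETL scripts in this patch
# (hypothetical helper; the individual scripts inline this logic).
import argparse

DEFAULT_DATASET = (
    "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5"
)


def derive_year_from_dataset(dataset: str) -> int:
    """Load the dataset and return its default calculation period as an int."""
    from policyengine_us import Microsimulation

    sim = Microsimulation(dataset=dataset)
    return int(sim.default_calculation_period)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="ETL for calibration targets")
    parser.add_argument(
        "--dataset",
        default=DEFAULT_DATASET,
        help="Source dataset; the target year is derived from its "
        "default_calculation_period.",
    )
    args = parser.parse_args()
    year = derive_year_from_dataset(args.dataset)
    print(f"Derived year from dataset: {year}")
```

Extracting this into a shared utility would avoid six near-identical copies if the pattern changes again.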
Updated scripts: - create_initial_strata.py - etl_age.py - etl_irs_soi.py (with configurable --lag for IRS data delay) - etl_medicaid.py - etl_snap.py - etl_state_income_tax.py Co-Authored-By: Claude Opus 4.5 --- changelog_entry.yaml | 2 +- .../db/create_initial_strata.py | 28 +++++++++++- policyengine_us_data/db/etl_age.py | 32 +++++++++++++- policyengine_us_data/db/etl_irs_soi.py | 43 +++++++++++++++++-- policyengine_us_data/db/etl_medicaid.py | 25 ++++++++++- policyengine_us_data/db/etl_snap.py | 25 ++++++++++- .../db/etl_state_income_tax.py | 25 ++++++++++- 7 files changed, 169 insertions(+), 11 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 6ea6b894d..4bbfcf6f0 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,3 +1,3 @@ - date: 2026-02-02 type: fixed - description: Fix stale calibration targets by deriving time_period from dataset and using income_tax_positive for CBO calibration + description: Fix stale calibration targets by deriving time_period from dataset across all ETL scripts and using income_tax_positive for CBO calibration diff --git a/policyengine_us_data/db/create_initial_strata.py b/policyengine_us_data/db/create_initial_strata.py index f3edb1b41..8dda76e29 100644 --- a/policyengine_us_data/db/create_initial_strata.py +++ b/policyengine_us_data/db/create_initial_strata.py @@ -1,3 +1,4 @@ +import argparse import logging from typing import Dict @@ -6,6 +7,8 @@ from sqlmodel import Session, create_engine from policyengine_us_data.storage import STORAGE_FOLDER + +DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -68,6 +71,28 @@ def fetch_congressional_districts(year): def main(): + parser = argparse.ArgumentParser( + description="Create initial geographic strata for calibration" + ) + parser.add_argument( + "--dataset", + default=DEFAULT_DATASET, + help=( + "Source dataset (local path or HuggingFace URL). " + "The year for Census API calls is derived from the dataset's " + "default_calculation_period. 
Default: %(default)s" + ), + ) + args = parser.parse_args() + + # Derive year from dataset + from policyengine_us import Microsimulation + + print(f"Loading dataset: {args.dataset}") + sim = Microsimulation(dataset=args.dataset) + year = int(sim.default_calculation_period) + print(f"Derived year from dataset: {year}") + # State FIPS to name/abbreviation mapping STATE_NAMES = { 1: "Alabama (AL)", @@ -123,8 +148,7 @@ def main(): 56: "Wyoming (WY)", } - # Fetch congressional district data for year 2023 - year = 2023 + # Fetch congressional district data cd_df = fetch_congressional_districts(year) DATABASE_URL = ( diff --git a/policyengine_us_data/db/etl_age.py b/policyengine_us_data/db/etl_age.py index 39ffedf22..13853ca44 100644 --- a/policyengine_us_data/db/etl_age.py +++ b/policyengine_us_data/db/etl_age.py @@ -1,9 +1,13 @@ +import argparse + import pandas as pd import numpy as np from sqlmodel import Session, create_engine, select from policyengine_us_data.storage import STORAGE_FOLDER +DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" + from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -279,10 +283,30 @@ def load_age_data(df_long, geo, year): session.commit() -if __name__ == "__main__": +def main(): + parser = argparse.ArgumentParser( + description="ETL for age calibration targets" + ) + parser.add_argument( + "--dataset", + default=DEFAULT_DATASET, + help=( + "Source dataset (local path or HuggingFace URL). " + "The year for Census API calls is derived from the dataset's " + "default_calculation_period. Default: %(default)s" + ), + ) + args = parser.parse_args() + + # Derive year from dataset + from policyengine_us import Microsimulation + + print(f"Loading dataset: {args.dataset}") + sim = Microsimulation(dataset=args.dataset) + year = int(sim.default_calculation_period) + print(f"Derived year from dataset: {year}") # --- ETL: Extract, Transform, Load ---- - year = 2023 # ---- Extract ---------- docs = get_census_docs(year) @@ -301,3 +325,7 @@ def load_age_data(df_long, geo, year): load_age_data(long_national_df, "National", year) load_age_data(long_state_df, "State", year) load_age_data(long_district_df, "District", year) + + +if __name__ == "__main__": + main() diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py index ed4da4e5c..873d7a072 100644 --- a/policyengine_us_data/db/etl_irs_soi.py +++ b/policyengine_us_data/db/etl_irs_soi.py @@ -1,3 +1,4 @@ +import argparse import logging from typing import Optional @@ -7,6 +8,11 @@ from sqlmodel import Session, create_engine, select from policyengine_us_data.storage import STORAGE_FOLDER + +DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" + +# IRS SOI data is typically available ~2 years after the tax year +IRS_SOI_LAG_YEARS = 2 from policyengine_us_data.utils.raw_cache import ( is_cached, cache_path, @@ -1207,9 +1213,40 @@ def load_soi_data(long_dfs, year): def main(): - # NOTE: predates the finalization of the 2020 Census redistricting - # and there is district mapping in the Transform step - year = 2022 + parser = argparse.ArgumentParser( + description="ETL for IRS SOI calibration targets" + ) + parser.add_argument( + "--dataset", + default=DEFAULT_DATASET, + help=( + "Source dataset (local path or HuggingFace URL). " + "The year for IRS SOI data is derived from the dataset's " + "default_calculation_period minus IRS_SOI_LAG_YEARS. 
" + "Default: %(default)s" + ), + ) + parser.add_argument( + "--lag", + type=int, + default=IRS_SOI_LAG_YEARS, + help=( + "Years to subtract from dataset year for IRS SOI data " + "(default: %(default)s, since IRS data is ~2 years behind)" + ), + ) + args = parser.parse_args() + + # Derive year from dataset with lag applied + from policyengine_us import Microsimulation + + print(f"Loading dataset: {args.dataset}") + sim = Microsimulation(dataset=args.dataset) + dataset_year = int(sim.default_calculation_period) + year = dataset_year - args.lag + print( + f"Dataset year: {dataset_year}, IRS SOI year: {year} (lag={args.lag})" + ) # Extract ----------------------- raw_df = extract_soi_data() diff --git a/policyengine_us_data/db/etl_medicaid.py b/policyengine_us_data/db/etl_medicaid.py index ed1841447..435ccd42c 100644 --- a/policyengine_us_data/db/etl_medicaid.py +++ b/policyengine_us_data/db/etl_medicaid.py @@ -1,3 +1,4 @@ +import argparse import logging import requests @@ -7,6 +8,8 @@ from policyengine_us_data.storage import STORAGE_FOLDER +DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" + from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -325,7 +328,27 @@ def load_medicaid_data(long_state, long_cd, year): def main(): - year = 2024 + parser = argparse.ArgumentParser( + description="ETL for Medicaid calibration targets" + ) + parser.add_argument( + "--dataset", + default=DEFAULT_DATASET, + help=( + "Source dataset (local path or HuggingFace URL). " + "The year for targets is derived from the dataset's " + "default_calculation_period. Default: %(default)s" + ), + ) + args = parser.parse_args() + + # Derive year from dataset + from policyengine_us import Microsimulation + + print(f"Loading dataset: {args.dataset}") + sim = Microsimulation(dataset=args.dataset) + year = int(sim.default_calculation_period) + print(f"Derived year from dataset: {year}") # Extract ------------------------------ state_admin_df = extract_administrative_medicaid_data(year) diff --git a/policyengine_us_data/db/etl_snap.py b/policyengine_us_data/db/etl_snap.py index 48c1eb832..a8a80f5ce 100644 --- a/policyengine_us_data/db/etl_snap.py +++ b/policyengine_us_data/db/etl_snap.py @@ -1,3 +1,4 @@ +import argparse import logging import requests import zipfile @@ -10,6 +11,8 @@ from policyengine_us_data.storage import STORAGE_FOLDER +DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" + from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -363,7 +366,27 @@ def load_survey_snap_data(survey_df, year, snap_stratum_lookup): def main(): - year = 2023 + parser = argparse.ArgumentParser( + description="ETL for SNAP calibration targets" + ) + parser.add_argument( + "--dataset", + default=DEFAULT_DATASET, + help=( + "Source dataset (local path or HuggingFace URL). " + "The year for targets is derived from the dataset's " + "default_calculation_period. 
Default: %(default)s" + ), + ) + args = parser.parse_args() + + # Derive year from dataset + from policyengine_us import Microsimulation + + print(f"Loading dataset: {args.dataset}") + sim = Microsimulation(dataset=args.dataset) + year = int(sim.default_calculation_period) + print(f"Derived year from dataset: {year}") # Extract --------- zip_file_admin = extract_administrative_snap_data() diff --git a/policyengine_us_data/db/etl_state_income_tax.py b/policyengine_us_data/db/etl_state_income_tax.py index df0f40a6c..9da8d8390 100644 --- a/policyengine_us_data/db/etl_state_income_tax.py +++ b/policyengine_us_data/db/etl_state_income_tax.py @@ -10,12 +10,15 @@ Stratum Group ID: 7 (State Income Tax) """ +import argparse import logging import pandas as pd import numpy as np from sqlmodel import Session, create_engine, select from policyengine_us_data.storage import STORAGE_FOLDER + +DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5" from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -342,12 +345,32 @@ def load_state_income_tax_data(df: pd.DataFrame, year: int) -> dict: def main(): """Run the full ETL pipeline for state income tax targets.""" + parser = argparse.ArgumentParser( + description="ETL for state income tax calibration targets" + ) + parser.add_argument( + "--dataset", + default=DEFAULT_DATASET, + help=( + "Source dataset (local path or HuggingFace URL). " + "The year for targets is derived from the dataset's " + "default_calculation_period. Default: %(default)s" + ), + ) + args = parser.parse_args() + logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", ) - year = 2023 + # Derive year from dataset + from policyengine_us import Microsimulation + + logger.info(f"Loading dataset: {args.dataset}") + sim = Microsimulation(dataset=args.dataset) + year = int(sim.default_calculation_period) + logger.info(f"Derived year from dataset: {year}") logger.info(f"Extracting Census STC data for FY{year}...") raw_df = extract_state_income_tax_data(year) From 9735cf0793b9de8157c85eafa5f4c5e876524756 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Mon, 2 Feb 2026 14:54:40 -0500 Subject: [PATCH 4/8] Add 119th Congress district code support for 2024 ACS data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Update parse_ucgid to recognize both 5001800US (118th) and 5001900US (119th Congress) - Expand Puerto Rico and territory filters to handle both Congress code formats - Update TERRITORY_UCGIDS and NON_VOTING_GEO_IDS with 119th Congress codes This ensures consistent redistricting alignment: 2024 ACS data uses 119th Congress codes natively, and IRS SOI data is converted via the 116th→119th mapping matrix. 
Co-Authored-By: Claude Haiku 4.5 --- changelog_entry.yaml | 2 +- policyengine_us_data/db/etl_age.py | 7 +++++-- policyengine_us_data/db/etl_snap.py | 3 ++- .../storage/calibration_targets/pull_soi_targets.py | 7 +++++++ policyengine_us_data/utils/census.py | 7 +++++++ policyengine_us_data/utils/db.py | 5 ++++- 6 files changed, 26 insertions(+), 5 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 4bbfcf6f0..1d930f19e 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,3 +1,3 @@ - date: 2026-02-02 type: fixed - description: Fix stale calibration targets by deriving time_period from dataset across all ETL scripts and using income_tax_positive for CBO calibration + description: Fix stale calibration targets by deriving time_period from dataset across all ETL scripts, using income_tax_positive for CBO calibration, and adding 119th Congress district code support for consistent redistricting alignment diff --git a/policyengine_us_data/db/etl_age.py b/policyengine_us_data/db/etl_age.py index 13853ca44..2e213d92b 100644 --- a/policyengine_us_data/db/etl_age.py +++ b/policyengine_us_data/db/etl_age.py @@ -70,9 +70,12 @@ def transform_age_data(age_data, docs): df = df.rename({"GEO_ID": "ucgid_str"}, axis=1) df_data = df.rename(columns=rename_mapping)[["ucgid_str"] + list(AGE_COLS)] - # Filter out Puerto Rico's district and state records, if needed + # Filter out Puerto Rico's district and state records + # 5001800US7298 = 118th Congress, 5001900US7298 = 119th Congress df_geos = df_data[ - ~df_data["ucgid_str"].isin(["5001800US7298", "0400000US72"]) + ~df_data["ucgid_str"].isin( + ["5001800US7298", "5001900US7298", "0400000US72"] + ) ].copy() df = df_geos[["ucgid_str"] + AGE_COLS] diff --git a/policyengine_us_data/db/etl_snap.py b/policyengine_us_data/db/etl_snap.py index a8a80f5ce..554f67ec1 100644 --- a/policyengine_us_data/db/etl_snap.py +++ b/policyengine_us_data/db/etl_snap.py @@ -152,9 +152,10 @@ def transform_survey_snap_data(raw_df): {"GEO_ID": "ucgid_str", "S2201_C03_001E": "snap_household_ct"}, axis=1 )[ ~df["GEO_ID"].isin( - [ # Puerto Rico's state and district + [ # Puerto Rico's state and district (118th and 119th Congress) "0400000US72", "5001800US7298", + "5001900US7298", ] ) ] diff --git a/policyengine_us_data/storage/calibration_targets/pull_soi_targets.py b/policyengine_us_data/storage/calibration_targets/pull_soi_targets.py index 0b1f3dcb6..c3f159191 100644 --- a/policyengine_us_data/storage/calibration_targets/pull_soi_targets.py +++ b/policyengine_us_data/storage/calibration_targets/pull_soi_targets.py @@ -41,11 +41,18 @@ NON_VOTING_STATES = {"US", "AS", "GU", "MP", "PR", "VI", "OA"} NON_VOTING_GEO_IDS = { "0400000US72", # Puerto Rico (state level) + # 118th Congress codes "5001800US7298", # Puerto Rico "5001800US6098", # American Samoa "5001800US6698", # Guam "5001800US6998", # Northern Mariana Islands "5001800US7898", # U.S. Virgin Islands + # 119th Congress codes + "5001900US7298", # Puerto Rico + "5001900US6098", # American Samoa + "5001900US6698", # Guam + "5001900US6998", # Northern Mariana Islands + "5001900US7898", # U.S. 
Virgin Islands } # after skipping the first 7 rows, the national SOI file has targets as row indices [COUNT_INDEX, AMOUNT_INDEX] diff --git a/policyengine_us_data/utils/census.py b/policyengine_us_data/utils/census.py index cb9d0b5d8..c61cc166d 100644 --- a/policyengine_us_data/utils/census.py +++ b/policyengine_us_data/utils/census.py @@ -123,11 +123,18 @@ TERRITORY_UCGIDS = { "0400000US72", # Puerto Rico (state level) + # 118th Congress codes "5001800US7298", # Puerto Rico "5001800US6098", # American Samoa "5001800US6698", # Guam "5001800US6998", # Northern Mariana Islands "5001800US7898", # U.S. Virgin Islands + # 119th Congress codes + "5001900US7298", # Puerto Rico + "5001900US6098", # American Samoa + "5001900US6698", # Guam + "5001900US6998", # Northern Mariana Islands + "5001900US7898", # U.S. Virgin Islands } diff --git a/policyengine_us_data/utils/db.py b/policyengine_us_data/utils/db.py index 6c7b1a4ed..4de79c44f 100644 --- a/policyengine_us_data/utils/db.py +++ b/policyengine_us_data/utils/db.py @@ -82,7 +82,10 @@ def parse_ucgid(ucgid_str: str) -> Dict: elif ucgid_str.startswith("0400000US"): state_fips = int(ucgid_str[9:]) return {"type": "state", "state_fips": state_fips} - elif ucgid_str.startswith("5001800US"): + elif ucgid_str.startswith("5001800US") or ucgid_str.startswith( + "5001900US" + ): + # 5001800US = 118th Congress, 5001900US = 119th Congress state_and_district = ucgid_str[9:] state_fips = int(state_and_district[:2]) district_number = int(state_and_district[2:]) From e5ecbacae43c1e695224e172bbef50d9b2bbf19f Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Wed, 4 Feb 2026 16:25:06 -0500 Subject: [PATCH 5/8] Remove seed-related changes to reduce PR scope Revert deterministic hash-based medicaid/SSI seed logic in cps.py, update Makefile seed to 3526. Co-Authored-By: Claude Opus 4.5 --- Makefile | 2 +- .../calibration_utils.py | 49 +++++++------------ 2 files changed, 18 insertions(+), 33 deletions(-) diff --git a/Makefile b/Makefile index c538a7bc9..09d984a96 100644 --- a/Makefile +++ b/Makefile @@ -87,7 +87,7 @@ data: download python policyengine_us_data/datasets/cps/extended_cps.py python policyengine_us_data/datasets/cps/enhanced_cps.py python policyengine_us_data/datasets/cps/small_enhanced_cps.py - python policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py 10500 + python policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py 12000 --top=99.5 --seed=3526 publish-local-area: python policyengine_us_data/datasets/cps/local_area_calibration/publish_local_area.py diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py b/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py index aa954aba1..c27cf3e04 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py @@ -252,39 +252,24 @@ def get_pseudo_input_variables(sim) -> set: """ Identify pseudo-input variables that should NOT be saved to H5 files. - A pseudo-input is a variable that: - - Appears in sim.input_variables (has stored values) - - Has 'adds' or 'subtracts' attribute - - At least one component has a formula (is calculated) - - These variables have stale pre-computed values that corrupt calculations - when reloaded, because the stored value overrides the formula. + NOTE: This function currently returns an empty set. 
The original logic + excluded variables with 'adds' or 'subtracts' attributes, but analysis + showed that in CPS data, these variables contain authoritative stored + data that does NOT match their component variables: + + - pre_tax_contributions: components are all 0, aggregate has imputed values + - tax_exempt_pension_income: aggregate has 135M, components only 20M + - taxable_pension_income: aggregate has 82M, components only 29M + - interest_deduction: aggregate has 41M, components are 0 + + The 'adds' attribute defines how to CALCULATE these values, but in CPS + data the stored values are the authoritative source. Excluding them and + recalculating from components produces incorrect results. + + For geo-stacking, entity ID reindexing preserves within-entity + relationships, so aggregation within a person or tax_unit remains valid. """ - tbs = sim.tax_benefit_system - pseudo_inputs = set() - - for var_name in sim.input_variables: - var = tbs.variables.get(var_name) - if not var: - continue - - adds = getattr(var, "adds", None) - if adds and isinstance(adds, list): - for component in adds: - comp_var = tbs.variables.get(component) - if comp_var and len(getattr(comp_var, "formulas", {})) > 0: - pseudo_inputs.add(var_name) - break - - subtracts = getattr(var, "subtracts", None) - if subtracts and isinstance(subtracts, list): - for component in subtracts: - comp_var = tbs.variables.get(component) - if comp_var and len(getattr(comp_var, "formulas", {})) > 0: - pseudo_inputs.add(var_name) - break - - return pseudo_inputs + return set() def apply_op(values: np.ndarray, op: str, val: str) -> np.ndarray: From 8c4484820f17e0aeb53d96547972461bbb9b71f0 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Wed, 4 Feb 2026 17:20:06 -0500 Subject: [PATCH 6/8] Upgrade policyengine-us to 1.550.1 in uv.lock Needed for income_tax_positive variable used in loss.py. 
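Before rebuilding the database, it is worth confirming the environment actually picked up the upgraded dependency, since loss.py now calculates income_tax_positive. A quick sanity-check sketch, using the default calibration dataset path from the ETL scripts above:

```python
# Verify the upgraded policyengine-us exposes income_tax_positive before
# re-running loss.py or the ETL scripts (sketch; the dataset path is the
# DEFAULT_DATASET used elsewhere in this patch series).
from importlib.metadata import version
from policyengine_us import Microsimulation

print("policyengine-us", version("policyengine-us"))  # expect >= 1.550.1

sim = Microsimulation(
    dataset="hf://policyengine/policyengine-us-data/calibration/stratified_extended_cps.h5"
)
assert "income_tax_positive" in sim.tax_benefit_system.variables
```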
Co-Authored-By: Claude Opus 4.5 --- uv.lock | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/uv.lock b/uv.lock index 4c2966f29..95f8747f8 100644 --- a/uv.lock +++ b/uv.lock @@ -637,6 +637,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f8/0a/a3871375c7b9727edaeeea994bfff7c63ff7804c9829c19309ba2e058807/greenlet-3.3.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:b01548f6e0b9e9784a2c99c5651e5dc89ffcbe870bc5fb2e5ef864e9cc6b5dcb", size = 276379, upload-time = "2025-12-04T14:23:30.498Z" }, { url = "https://files.pythonhosted.org/packages/43/ab/7ebfe34dce8b87be0d11dae91acbf76f7b8246bf9d6b319c741f99fa59c6/greenlet-3.3.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:349345b770dc88f81506c6861d22a6ccd422207829d2c854ae2af8025af303e3", size = 597294, upload-time = "2025-12-04T14:50:06.847Z" }, { url = "https://files.pythonhosted.org/packages/a4/39/f1c8da50024feecd0793dbd5e08f526809b8ab5609224a2da40aad3a7641/greenlet-3.3.0-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e8e18ed6995e9e2c0b4ed264d2cf89260ab3ac7e13555b8032b25a74c6d18655", size = 607742, upload-time = "2025-12-04T14:57:42.349Z" }, + { url = "https://files.pythonhosted.org/packages/77/cb/43692bcd5f7a0da6ec0ec6d58ee7cddb606d055ce94a62ac9b1aa481e969/greenlet-3.3.0-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c024b1e5696626890038e34f76140ed1daf858e37496d33f2af57f06189e70d7", size = 622297, upload-time = "2025-12-04T15:07:13.552Z" }, { url = "https://files.pythonhosted.org/packages/75/b0/6bde0b1011a60782108c01de5913c588cf51a839174538d266de15e4bf4d/greenlet-3.3.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:047ab3df20ede6a57c35c14bf5200fcf04039d50f908270d3f9a7a82064f543b", size = 609885, upload-time = "2025-12-04T14:26:02.368Z" }, { url = "https://files.pythonhosted.org/packages/49/0e/49b46ac39f931f59f987b7cd9f34bfec8ef81d2a1e6e00682f55be5de9f4/greenlet-3.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2d9ad37fc657b1102ec880e637cccf20191581f75c64087a549e66c57e1ceb53", size = 1567424, upload-time = "2025-12-04T15:04:23.757Z" }, { url = "https://files.pythonhosted.org/packages/05/f5/49a9ac2dff7f10091935def9165c90236d8f175afb27cbed38fb1d61ab6b/greenlet-3.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83cd0e36932e0e7f36a64b732a6f60c2fc2df28c351bae79fbaf4f8092fe7614", size = 1636017, upload-time = "2025-12-04T14:27:29.688Z" }, @@ -644,6 +645,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/02/2f/28592176381b9ab2cafa12829ba7b472d177f3acc35d8fbcf3673d966fff/greenlet-3.3.0-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:a1e41a81c7e2825822f4e068c48cb2196002362619e2d70b148f20a831c00739", size = 275140, upload-time = "2025-12-04T14:23:01.282Z" }, { url = "https://files.pythonhosted.org/packages/2c/80/fbe937bf81e9fca98c981fe499e59a3f45df2a04da0baa5c2be0dca0d329/greenlet-3.3.0-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9f515a47d02da4d30caaa85b69474cec77b7929b2e936ff7fb853d42f4bf8808", size = 599219, upload-time = "2025-12-04T14:50:08.309Z" }, { url = "https://files.pythonhosted.org/packages/c2/ff/7c985128f0514271b8268476af89aee6866df5eec04ac17dcfbc676213df/greenlet-3.3.0-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7d2d9fd66bfadf230b385fdc90426fcd6eb64db54b40c495b72ac0feb5766c54", size = 610211, upload-time = "2025-12-04T14:57:43.968Z" }, + { url = 
"https://files.pythonhosted.org/packages/79/07/c47a82d881319ec18a4510bb30463ed6891f2ad2c1901ed5ec23d3de351f/greenlet-3.3.0-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:30a6e28487a790417d036088b3bcb3f3ac7d8babaa7d0139edbaddebf3af9492", size = 624311, upload-time = "2025-12-04T15:07:14.697Z" }, { url = "https://files.pythonhosted.org/packages/fd/8e/424b8c6e78bd9837d14ff7df01a9829fc883ba2ab4ea787d4f848435f23f/greenlet-3.3.0-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:087ea5e004437321508a8d6f20efc4cfec5e3c30118e1417ea96ed1d93950527", size = 612833, upload-time = "2025-12-04T14:26:03.669Z" }, { url = "https://files.pythonhosted.org/packages/b5/ba/56699ff9b7c76ca12f1cdc27a886d0f81f2189c3455ff9f65246780f713d/greenlet-3.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ab97cf74045343f6c60a39913fa59710e4bd26a536ce7ab2397adf8b27e67c39", size = 1567256, upload-time = "2025-12-04T15:04:25.276Z" }, { url = "https://files.pythonhosted.org/packages/1e/37/f31136132967982d698c71a281a8901daf1a8fbab935dce7c0cf15f942cc/greenlet-3.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5375d2e23184629112ca1ea89a53389dddbffcf417dad40125713d88eb5f96e8", size = 1636483, upload-time = "2025-12-04T14:27:30.804Z" }, @@ -1842,7 +1844,7 @@ wheels = [ [[package]] name = "policyengine-us" -version = "1.524.1" +version = "1.550.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "microdf-python" }, @@ -1850,9 +1852,9 @@ dependencies = [ { name = "policyengine-core" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/2b/f0/623a10a84d501039b100808f3f42020ae68c5a793ff2865c58a034be1b32/policyengine_us-1.524.1.tar.gz", hash = "sha256:329ab166681264fcacbe4549afa023cf96bda2348d48bfc385e8328afc811ec8", size = 8503673, upload-time = "2026-01-26T20:47:54.189Z" } +sdist = { url = "https://files.pythonhosted.org/packages/03/a7/031dfe91081446203cf90203ca3305b09a93d495852df18a5177d3784bce/policyengine_us-1.550.1.tar.gz", hash = "sha256:7c0900d5f09ceadcc2047a2f7a4f84e23e30f461653a358ed49c0eb1f1337a66", size = 8606705, upload-time = "2026-02-04T19:52:56.341Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a2/08/c53fd0555b2fa95cb81994ae20f4fb9df1d35b14efad26f911ff6d0ead75/policyengine_us-1.524.1-py3-none-any.whl", hash = "sha256:e940b977090a2aea782724668d798ddf667b1b1bda7036e93f3caeaf853ea436", size = 7398760, upload-time = "2026-01-26T20:47:51.851Z" }, + { url = "https://files.pythonhosted.org/packages/ad/52/cceeb4ee203addcf049c7443fa53455a66962b650e09e3f82e36d65b427e/policyengine_us-1.550.1-py3-none-any.whl", hash = "sha256:9e1fd0c709502ff69b6051ecd7ed5fea49bd01629790867168651f4a117cb703", size = 7600766, upload-time = "2026-02-04T19:52:53.92Z" }, ] [[package]] From c54ae1c4d006d5e647209ec25b6ff5103a324d39 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Sat, 7 Feb 2026 15:26:34 -0500 Subject: [PATCH 7/8] Cherry-pick ACA PTC targets from PR #508 and update changelog Adds aca_ptc ingestion from IRS SOI data (code 85530) to etl_irs_soi.py and updates DATABASE_GUIDE.md to reflect stratum_group_id 119. 
Co-Authored-By: Claude Opus 4.6 --- changelog_entry.yaml | 3 +++ policyengine_us_data/db/DATABASE_GUIDE.md | 2 +- policyengine_us_data/db/etl_irs_soi.py | 23 +++++++++++++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 1d930f19e..c69b45735 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,3 +1,6 @@ - date: 2026-02-02 type: fixed description: Fix stale calibration targets by deriving time_period from dataset across all ETL scripts, using income_tax_positive for CBO calibration, and adding 119th Congress district code support for consistent redistricting alignment +- date: 2026-02-07 + type: added + description: Add ACA Premium Tax Credit targets from IRS SOI data (cherry-picked from PR #508) diff --git a/policyengine_us_data/db/DATABASE_GUIDE.md b/policyengine_us_data/db/DATABASE_GUIDE.md index ac038cb7e..a9f8f3989 100644 --- a/policyengine_us_data/db/DATABASE_GUIDE.md +++ b/policyengine_us_data/db/DATABASE_GUIDE.md @@ -110,7 +110,7 @@ The `stratum_group_id` field categorizes strata: | 5 | Medicaid | Medicaid enrollment strata | | 6 | EITC | EITC recipients by qualifying children | | 7 | State Income Tax | State-level income tax collections (Census STC) | -| 100-118 | IRS Conditional | Each IRS variable paired with conditional count constraints | +| 100-119 | IRS Conditional | Each IRS variable paired with conditional count constraints (includes ACA PTC at 119) | ### Conditional Strata (IRS SOI) diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py index 873d7a072..5f191ce3f 100644 --- a/policyengine_us_data/db/etl_irs_soi.py +++ b/policyengine_us_data/db/etl_irs_soi.py @@ -287,6 +287,7 @@ def transform_soi_data(raw_df): dict(code="18425", name="salt", breakdown=None), dict(code="06500", name="income_tax", breakdown=None), dict(code="05800", name="income_tax_before_credits", breakdown=None), + dict(code="85530", name="aca_ptc", breakdown=None), ] # National --------------- @@ -572,6 +573,28 @@ def load_soi_data(long_dfs, year): units="dollars", ) + # ACA Premium Tax Credit + ptc_group = get_or_create_variable_group( + session, + name="aca_ptc_recipients", + category="tax", + is_histogram=False, + is_exclusive=False, + aggregation_method="sum", + display_order=9, + description="ACA Premium Tax Credit recipients and amounts", + ) + + get_or_create_variable_metadata( + session, + variable="aca_ptc", + group=ptc_group, + display_name="Premium Tax Credit", + display_order=1, + units="dollars", + notes="ACA Premium Tax Credit amount from IRS SOI", + ) + # Fetch existing geographic strata geo_strata = get_geographic_strata(session) From 9a7a81be44cd6f8f6c27a0ff9c14ebc4de483518 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Sat, 7 Feb 2026 19:04:42 -0500 Subject: [PATCH 8/8] Split local area publish into build+stage and promote phases Prevents silent no-op promotes by detecting when HF commits don't change HEAD. Adds separate promote workflow for manual gate before pushing staging files to production. Also bumps calibration epochs from 200 to 250. 
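The no-op guard works by recording the repository HEAD before the promote commit and comparing it to the sha returned by the commit call. A minimal sketch of that check using huggingface_hub directly (the repo id and the copy operation are placeholders; the patch routes the commit through hf_create_commit_with_retry and builds its operations from the staged manifest):

```python
# Sketch of the no-op detection added to promote_staging_to_production_hf:
# if the commit's returned oid equals the pre-commit HEAD, nothing changed.
import os

from huggingface_hub import CommitOperationCopy, HfApi

api = HfApi(token=os.environ.get("HUGGING_FACE_TOKEN"))
repo_id = "policyengine/example-dataset"  # placeholder, not the production repo

head_before = api.repo_info(repo_id=repo_id, repo_type="dataset").sha

commit = api.create_commit(
    repo_id=repo_id,
    repo_type="dataset",
    operations=[
        CommitOperationCopy(
            src_path_in_repo="staging/v1/example.h5",  # placeholder paths
            path_in_repo="v1/example.h5",
        )
    ],
    commit_message="Promote staged files to production",
)

if commit.oid == head_before:
    raise RuntimeError("Promote commit was a no-op: HEAD did not move")
```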
Co-Authored-By: Claude Opus 4.6 --- .github/workflows/local_area_promote.yaml | 43 +++++++ .github/workflows/local_area_publish.yaml | 12 +- modal_app/local_area.py | 114 +++++++++++++++--- .../datasets/cps/enhanced_cps.py | 2 +- policyengine_us_data/utils/data_upload.py | 34 +++++- 5 files changed, 184 insertions(+), 21 deletions(-) create mode 100644 .github/workflows/local_area_promote.yaml diff --git a/.github/workflows/local_area_promote.yaml b/.github/workflows/local_area_promote.yaml new file mode 100644 index 000000000..8d7d235ea --- /dev/null +++ b/.github/workflows/local_area_promote.yaml @@ -0,0 +1,43 @@ +name: Promote Local Area H5 Files + +on: + workflow_dispatch: + inputs: + version: + description: 'Version to promote (e.g. 1.23.0)' + required: true + type: string + branch: + description: 'Branch to use for repo setup' + required: false + default: 'main' + type: string + +jobs: + promote-local-area: + runs-on: ubuntu-latest + permissions: + contents: read + env: + HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} + MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }} + MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }} + + steps: + - name: Checkout repo + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.13' + + - name: Install Modal CLI + run: pip install modal + + - name: Promote staged files to production + run: | + VERSION="${{ github.event.inputs.version }}" + BRANCH="${{ github.event.inputs.branch }}" + echo "Promoting version ${VERSION} from branch ${BRANCH}" + modal run modal_app/local_area.py::main_promote --version="${VERSION}" --branch="${BRANCH}" diff --git a/.github/workflows/local_area_publish.yaml b/.github/workflows/local_area_publish.yaml index 7e756ad85..44675e63e 100644 --- a/.github/workflows/local_area_publish.yaml +++ b/.github/workflows/local_area_publish.yaml @@ -49,7 +49,7 @@ jobs: - name: Install Modal CLI run: pip install modal - - name: Run local area publishing on Modal + - name: Run local area build and stage on Modal run: | NUM_WORKERS="${{ github.event.inputs.num_workers || '8' }}" SKIP_UPLOAD="${{ github.event.inputs.skip_upload || 'false' }}" @@ -63,3 +63,13 @@ jobs: echo "Running: $CMD" $CMD + + - name: Post-build summary + if: success() + run: | + echo "## Build + Stage Complete" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "Files have been uploaded to GCS and staged on HuggingFace." >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "### Next step: Promote to production" >> $GITHUB_STEP_SUMMARY + echo "Trigger the **Promote Local Area H5 Files** workflow with the version from the build output." >> $GITHUB_STEP_SUMMARY diff --git a/modal_app/local_area.py b/modal_app/local_area.py index d10273d5b..92e068335 100644 --- a/modal_app/local_area.py +++ b/modal_app/local_area.py @@ -259,14 +259,11 @@ def validate_staging(branch: str, version: str) -> Dict: memory=8192, timeout=14400, ) -def atomic_upload(branch: str, version: str, manifest: Dict) -> str: +def upload_to_staging(branch: str, version: str, manifest: Dict) -> str: """ - Upload files using staging approach for atomic deployment. + Upload files to GCS (production) and HuggingFace (staging only). - 1. Upload to GCS (direct, overwrites existing) - 2. Upload to HuggingFace staging/ folder - 3. Atomically promote staging/ to production paths - 4. Clean up staging/ + Promote must be run separately via promote_publish. 
""" setup_gcp_credentials() setup_repo(branch) @@ -286,8 +283,6 @@ def atomic_upload(branch: str, version: str, manifest: Dict) -> str: from policyengine_us_data.utils.data_upload import ( upload_local_area_file, upload_to_staging_hf, - promote_staging_to_production_hf, - cleanup_staging_hf, ) manifest = json.loads('''{manifest_json}''') @@ -306,11 +301,9 @@ def atomic_upload(branch: str, version: str, manifest: Dict) -> str: print(f"Verified {{verification['verified']}} files") files_with_paths = [] -rel_paths = [] for rel_path in manifest["files"].keys(): local_path = version_dir / rel_path files_with_paths.append((local_path, rel_path)) - rel_paths.append(rel_path) # Upload to GCS (direct to production paths) print(f"Uploading {{len(files_with_paths)}} files to GCS...") @@ -331,12 +324,73 @@ def atomic_upload(branch: str, version: str, manifest: Dict) -> str: hf_count = upload_to_staging_hf(files_with_paths, version) print(f"Uploaded {{hf_count}} files to HuggingFace staging/") -# Atomically promote staging to production -print("Promoting staging/ to production (atomic commit)...") +print(f"Staged version {{version}} for promotion") +""", + ], + text=True, + env=os.environ.copy(), + ) + + if result.returncode != 0: + raise RuntimeError(f"Upload failed: {result.stderr}") + + return ( + f"Staged version {version} with {len(manifest['files'])} files. " + f"Run promote workflow to publish to HuggingFace production." + ) + + +@app.function( + image=image, + secrets=[hf_secret], + volumes={VOLUME_MOUNT: staging_volume}, + memory=4096, + timeout=3600, +) +def promote_publish(branch: str = "main", version: str = "") -> str: + """ + Promote staged files from HF staging/ to production paths, then cleanup. + + Reads the manifest from the Modal staging volume to determine which + files to promote. + """ + setup_repo(branch) + + staging_dir = Path(VOLUME_MOUNT) + staging_volume.reload() + + manifest_path = staging_dir / version / "manifest.json" + if not manifest_path.exists(): + raise RuntimeError( + f"No manifest found at {manifest_path}. " + f"Run build+stage workflow first." 
+ ) + + with open(manifest_path) as f: + manifest = json.load(f) + + rel_paths_json = json.dumps(list(manifest["files"].keys())) + + result = subprocess.run( + [ + "uv", + "run", + "python", + "-c", + f""" +import json +from policyengine_us_data.utils.data_upload import ( + promote_staging_to_production_hf, + cleanup_staging_hf, +) + +rel_paths = json.loads('''{rel_paths_json}''') +version = "{version}" + +print(f"Promoting {{len(rel_paths)}} files from staging/ to production...") promoted = promote_staging_to_production_hf(rel_paths, version) print(f"Promoted {{promoted}} files to production") -# Clean up staging print("Cleaning up staging/...") cleaned = cleanup_staging_hf(rel_paths, version) print(f"Cleaned up {{cleaned}} files from staging/") @@ -349,9 +403,9 @@ def atomic_upload(branch: str, version: str, manifest: Dict) -> str: ) if result.returncode != 0: - raise RuntimeError(f"Upload failed: {result.stderr}") + raise RuntimeError(f"Promote failed: {result.stderr}") - return f"Successfully published version {version} with {len(manifest['files'])} files" + return f"Successfully promoted version {version} with {len(manifest['files'])} files" @app.function( @@ -544,10 +598,24 @@ def coordinate_publish( f"WARNING: Expected {expected_total} files, found {actual_total}" ) - print("\nStarting atomic upload...") - result = atomic_upload.remote( + print("\nStarting upload to staging...") + result = upload_to_staging.remote( branch=branch, version=version, manifest=manifest ) + print(result) + + print("\n" + "=" * 60) + print("BUILD + STAGE COMPLETE") + print("=" * 60) + print( + f"To promote to HuggingFace production, run the " + f"'Promote Local Area H5 Files' workflow with version={version}" + ) + print( + "Or run manually: modal run modal_app/local_area.py::main_promote " + f"--version={version}" + ) + print("=" * 60) return result @@ -565,3 +633,15 @@ def main( skip_upload=skip_upload, ) print(result) + + +@app.local_entrypoint() +def main_promote( + version: str = "", + branch: str = "main", +): + """Promote staged files to HuggingFace production.""" + if not version: + raise ValueError("--version is required") + result = promote_publish.remote(branch=branch, version=version) + print(result) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 9799e99ac..385ec1e97 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -196,7 +196,7 @@ def generate(self): loss_matrix_clean, targets_array_clean, log_path="calibration_log.csv", - epochs=200, + epochs=250, seed=1456, ) data["household_weight"][year] = optimised_weights diff --git a/policyengine_us_data/utils/data_upload.py b/policyengine_us_data/utils/data_upload.py index a94ca1a13..42cd8feee 100644 --- a/policyengine_us_data/utils/data_upload.py +++ b/policyengine_us_data/utils/data_upload.py @@ -362,6 +362,9 @@ def promote_staging_to_production_hf( Returns: Number of files promoted + + Raises: + RuntimeError: If the commit was a no-op (HEAD unchanged) """ token = os.environ.get("HUGGING_FACE_TOKEN") api = HfApi() @@ -380,7 +383,13 @@ def promote_staging_to_production_hf( logging.warning("No files to promote.") return 0 - hf_create_commit_with_retry( + head_before = api.repo_info( + repo_id=hf_repo_name, + repo_type=hf_repo_type, + token=token, + ).sha + + result = hf_create_commit_with_retry( api=api, operations=operations, repo_id=hf_repo_name, @@ -389,6 +398,12 @@ def promote_staging_to_production_hf( 
commit_message=f"Promote {len(files)} files from staging to production for version {version}", ) + if result.oid == head_before: + raise RuntimeError( + f"Promote commit was a no-op: HEAD stayed at {head_before}. " + f"Staging files may be identical to production." + ) + logging.info( f"Promoted {len(files)} files from staging/ to production in one commit" ) @@ -412,6 +427,9 @@ def cleanup_staging_hf( Returns: Number of files deleted + + Raises: + RuntimeError: If the cleanup commit was a no-op (HEAD unchanged) """ token = os.environ.get("HUGGING_FACE_TOKEN") api = HfApi() @@ -424,7 +442,13 @@ def cleanup_staging_hf( if not operations: return 0 - hf_create_commit_with_retry( + head_before = api.repo_info( + repo_id=hf_repo_name, + repo_type=hf_repo_type, + token=token, + ).sha + + result = hf_create_commit_with_retry( api=api, operations=operations, repo_id=hf_repo_name, @@ -433,5 +457,11 @@ def cleanup_staging_hf( commit_message=f"Clean up staging after version {version} promotion", ) + if result.oid == head_before: + raise RuntimeError( + f"Cleanup commit was a no-op: HEAD stayed at {head_before}. " + f"Staging files may not exist." + ) + logging.info(f"Cleaned up {len(files)} files from staging/") return len(files)