From 9582ecfab65a7e0cec797b8ab32720797f40baec Mon Sep 17 00:00:00 2001 From: David Trimmer Date: Thu, 4 Dec 2025 16:08:39 -0500 Subject: [PATCH 01/10] Use spm-calculator to compute SPM thresholds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add spm-calculator as a dependency (from GitHub) - Create policyengine_us_data/utils/spm.py with threshold calculation utilities - Update CPS dataset to use spm-calculator with Census-provided GEOADJ factors - Update ACS dataset to use spm-calculator with national-level thresholds For CPS: Uses the SPM_GEOADJ values from Census data combined with spm-calculator's base thresholds and equivalence scale, providing geographically-adjusted thresholds. For ACS: Uses national-level thresholds (no geographic adjustment) since ACS doesn't include pre-computed GEOADJ values. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- policyengine_us_data/datasets/acs/acs.py | 51 +++++++++- policyengine_us_data/datasets/cps/cps.py | 23 ++++- policyengine_us_data/utils/spm.py | 122 +++++++++++++++++++++++ pyproject.toml | 1 + 4 files changed, 191 insertions(+), 6 deletions(-) create mode 100644 policyengine_us_data/utils/spm.py diff --git a/policyengine_us_data/datasets/acs/acs.py b/policyengine_us_data/datasets/acs/acs.py index 0ecd3ee7..836c2a1f 100644 --- a/policyengine_us_data/datasets/acs/acs.py +++ b/policyengine_us_data/datasets/acs/acs.py @@ -25,6 +25,7 @@ def generate(self) -> None: self.add_id_variables(acs, person, household) self.add_person_variables(acs, person, household) self.add_household_variables(acs, household) + self.add_spm_variables(acs, person, household, self.time_period) acs.close() raw_data.close() @@ -93,9 +94,53 @@ def add_person_variables( ) @staticmethod - def add_spm_variables(acs: h5py.File, spm_unit: DataFrame) -> None: - acs["spm_unit_net_income_reported"] = spm_unit.SPM_RESOURCES - acs["spm_unit_spm_threshold"] = spm_unit.SPM_POVTHRESHOLD + def add_spm_variables( + acs: h5py.File, + person: DataFrame, + household: DataFrame, + time_period: int, + ) -> None: + from policyengine_us_data.utils.spm import ( + calculate_spm_thresholds_national, + map_tenure_acs_to_spm, + ) + + # In ACS, SPM unit = household + # Calculate number of adults (18+) and children (<18) per household + person_with_hh = person.copy() + person_with_hh["is_adult"] = person_with_hh["AGEP"] >= 18 + person_with_hh["is_child"] = person_with_hh["AGEP"] < 18 + + hh_counts = ( + person_with_hh.groupby("household_id") + .agg({"is_adult": "sum", "is_child": "sum"}) + .rename( + columns={"is_adult": "num_adults", "is_child": "num_children"} + ) + ) + + # Ensure household is indexed properly + household_indexed = household.set_index("household_id") + + # Get counts aligned with household order + num_adults = hh_counts.loc[ + household_indexed.index, "num_adults" + ].values + num_children = hh_counts.loc[ + household_indexed.index, "num_children" + ].values + + # Map ACS tenure to SPM tenure codes + tenure_codes = map_tenure_acs_to_spm(household_indexed["TEN"].values) + + # Calculate SPM thresholds using national-level values + # (ACS doesn't have Census-provided geographic adjustments) + acs["spm_unit_spm_threshold"] = calculate_spm_thresholds_national( + num_adults=num_adults, + num_children=num_children, + tenure_codes=tenure_codes, + year=time_period, + ) @staticmethod def add_household_variables(acs: h5py.File, household: DataFrame) -> None: diff --git a/policyengine_us_data/datasets/cps/cps.py 
b/policyengine_us_data/datasets/cps/cps.py index f932e0d5..90fc72c3 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -78,7 +78,7 @@ def generate(self): undocumented_students_target=0.21 * 1.9e6, ) logging.info("Adding family variables") - add_spm_variables(cps, spm_unit) + add_spm_variables(cps, spm_unit, self.time_period) logging.info("Adding household variables") add_household_variables(cps, household) logging.info("Adding rent") @@ -602,7 +602,15 @@ def add_personal_income_variables( cps[f"{var}_would_be_qualified"] = rng.random(len(person)) < prob -def add_spm_variables(cps: h5py.File, spm_unit: DataFrame) -> None: +def add_spm_variables( + cps: h5py.File, + spm_unit: DataFrame, + time_period: int, +) -> None: + from policyengine_us_data.utils.spm import ( + calculate_spm_thresholds_with_geoadj, + ) + SPM_RENAMES = dict( spm_unit_total_income_reported="SPM_TOTVAL", snap_reported="SPM_SNAPSUB", @@ -616,7 +624,6 @@ def add_spm_variables(cps: h5py.File, spm_unit: DataFrame) -> None: # State tax includes refundable credits. spm_unit_state_tax_reported="SPM_STTAX", spm_unit_capped_work_childcare_expenses="SPM_CAPWKCCXPNS", - spm_unit_spm_threshold="SPM_POVTHRESHOLD", spm_unit_net_income_reported="SPM_RESOURCES", spm_unit_pre_subsidy_childcare_expenses="SPM_CHILDCAREXPNS", ) @@ -625,6 +632,16 @@ def add_spm_variables(cps: h5py.File, spm_unit: DataFrame) -> None: if asec_variable in spm_unit.columns: cps[openfisca_variable] = spm_unit[asec_variable] + # Calculate SPM thresholds using spm-calculator with Census-provided + # geographic adjustment factors (SPM_GEOADJ) + cps["spm_unit_spm_threshold"] = calculate_spm_thresholds_with_geoadj( + num_adults=spm_unit["SPM_NUMADULTS"].values, + num_children=spm_unit["SPM_NUMKIDS"].values, + tenure_codes=spm_unit["SPM_TENMORTSTATUS"].values, + geoadj=spm_unit["SPM_GEOADJ"].values, + year=time_period, + ) + cps["reduced_price_school_meals_reported"] = ( cps["free_school_meals_reported"] * 0 ) diff --git a/policyengine_us_data/utils/spm.py b/policyengine_us_data/utils/spm.py new file mode 100644 index 00000000..05c04234 --- /dev/null +++ b/policyengine_us_data/utils/spm.py @@ -0,0 +1,122 @@ +"""SPM threshold calculation utilities using the spm-calculator package.""" + +import numpy as np +from spm_calculator import SPMCalculator, spm_equivalence_scale + + +# Census CPS SPM_TENMORTSTATUS codes to spm-calculator tenure mapping +# Based on IPUMS SPMMORT documentation: +# 1 = Owner with mortgage +# 2 = Owner without mortgage +# 3 = Renter +TENURE_CODE_MAP = { + 1: "owner_with_mortgage", + 2: "owner_without_mortgage", + 3: "renter", +} + + +def calculate_spm_thresholds_with_geoadj( + num_adults: np.ndarray, + num_children: np.ndarray, + tenure_codes: np.ndarray, + geoadj: np.ndarray, + year: int, +) -> np.ndarray: + """ + Calculate SPM thresholds using Census-provided geographic adjustments. + + This function uses the SPM_GEOADJ values already computed by the Census + Bureau, combined with spm-calculator's base thresholds and equivalence + scale formula. This avoids the need for a Census API key. + + Args: + num_adults: Array of number of adults (18+) in each SPM unit. + num_children: Array of number of children (<18) in each SPM unit. + tenure_codes: Array of Census tenure/mortgage status codes. + 1 = owner with mortgage, 2 = owner without mortgage, 3 = renter. + geoadj: Array of Census SPM_GEOADJ geographic adjustment factors. + year: The year for which to calculate thresholds. 
+ + Returns: + Array of SPM threshold values. + """ + calc = SPMCalculator(year=year) + base_thresholds = calc.get_base_thresholds() + + n = len(num_adults) + thresholds = np.zeros(n) + + for i in range(n): + tenure_str = TENURE_CODE_MAP.get(int(tenure_codes[i]), "renter") + base = base_thresholds[tenure_str] + equiv_scale = spm_equivalence_scale( + int(num_adults[i]), int(num_children[i]) + ) + thresholds[i] = base * equiv_scale * geoadj[i] + + return thresholds + + +def calculate_spm_thresholds_national( + num_adults: np.ndarray, + num_children: np.ndarray, + tenure_codes: np.ndarray, + year: int, +) -> np.ndarray: + """ + Calculate SPM thresholds using national-level thresholds (no geoadj). + + This is used for datasets like ACS that don't have pre-computed + geographic adjustment factors. + + Args: + num_adults: Array of number of adults (18+) in each SPM unit. + num_children: Array of number of children (<18) in each SPM unit. + tenure_codes: Array of Census tenure/mortgage status codes. + 1 = owner with mortgage, 2 = owner without mortgage, 3 = renter. + year: The year for which to calculate thresholds. + + Returns: + Array of SPM threshold values using national averages. + """ + calc = SPMCalculator(year=year) + base_thresholds = calc.get_base_thresholds() + + n = len(num_adults) + thresholds = np.zeros(n) + + for i in range(n): + tenure_str = TENURE_CODE_MAP.get(int(tenure_codes[i]), "renter") + base = base_thresholds[tenure_str] + equiv_scale = spm_equivalence_scale( + int(num_adults[i]), int(num_children[i]) + ) + # No geographic adjustment for national-level thresholds + thresholds[i] = base * equiv_scale + + return thresholds + + +def map_tenure_acs_to_spm(tenure_type: np.ndarray) -> np.ndarray: + """ + Map ACS tenure type values to spm-calculator tenure codes. + + Args: + tenure_type: Array of ACS TEN values. + 1 = Owned with mortgage/loan + 2 = Owned free and clear + 3 = Rented + + Returns: + Array of tenure code integers matching Census SPM format. + """ + # ACS TEN codes map directly to Census SPM codes: + # ACS 1 (owned with mortgage) -> Census 1 (owner_with_mortgage) + # ACS 2 (owned outright) -> Census 2 (owner_without_mortgage) + # ACS 3 (rented) -> Census 3 (renter) + return np.where( + np.isin(tenure_type, [1, 2, 3]), + tenure_type, + 3, # Default to renter for unknown values + ) diff --git a/pyproject.toml b/pyproject.toml index 3d00d389..f80c7c7f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,7 @@ dependencies = [ "sqlalchemy>=2.0.41", "sqlmodel>=0.0.24", "xlrd>=2.0.2", + "spm-calculator @ git+https://github.com/PolicyEngine/spm-calculator.git", ] [project.optional-dependencies] From e9fa47e0f200926fd2ee9ea7848572984ac5bb7b Mon Sep 17 00:00:00 2001 From: David Trimmer Date: Thu, 4 Dec 2025 16:08:39 -0500 Subject: [PATCH 02/10] Use spm-calculator to compute SPM thresholds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add spm-calculator as a dependency (from GitHub) - Create policyengine_us_data/utils/spm.py with threshold calculation utilities - Update CPS dataset to use spm-calculator with Census-provided GEOADJ factors - Update ACS dataset to use spm-calculator with national-level thresholds For CPS: Uses the SPM_GEOADJ values from Census data combined with spm-calculator's base thresholds and equivalence scale, providing geographically-adjusted thresholds. For ACS: Uses national-level thresholds (no geographic adjustment) since ACS doesn't include pre-computed GEOADJ values. 
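As a reviewer aid (not part of the diff itself), here is a minimal usage sketch of the two new threshold helpers introduced in `policyengine_us_data/utils/spm.py`. The toy inputs and the 2024 year are illustrative; the call signatures mirror the functions defined in the patch below.

```python
# Illustrative only: exercise the new SPM threshold utilities on toy SPM units.
# Assumes this patch is applied and the spm-calculator dependency is installed.
import numpy as np

from policyengine_us_data.utils.spm import (
    calculate_spm_thresholds_with_geoadj,
    calculate_spm_thresholds_national,
)

num_adults = np.array([2, 1, 2])       # adults (18+) per SPM unit
num_children = np.array([2, 0, 1])     # children (<18) per SPM unit
tenure_codes = np.array([1, 3, 2])     # 1 = owner w/ mortgage, 2 = owner w/o, 3 = renter
geoadj = np.array([1.10, 0.95, 1.00])  # Census SPM_GEOADJ factors (toy values)

# CPS path: base threshold x equivalence scale x Census geographic adjustment.
cps_thresholds = calculate_spm_thresholds_with_geoadj(
    num_adults=num_adults,
    num_children=num_children,
    tenure_codes=tenure_codes,
    geoadj=geoadj,
    year=2024,
)

# ACS path: national-level thresholds, no geographic adjustment.
acs_thresholds = calculate_spm_thresholds_national(
    num_adults=num_adults,
    num_children=num_children,
    tenure_codes=tenure_codes,
    year=2024,
)

print(cps_thresholds, acs_thresholds)
```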
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- changelog_entry.yaml | 8 ++ policyengine_us_data/datasets/acs/acs.py | 51 +++++++++- policyengine_us_data/datasets/cps/cps.py | 23 ++++- policyengine_us_data/utils/spm.py | 122 +++++++++++++++++++++++ pyproject.toml | 1 + 5 files changed, 199 insertions(+), 6 deletions(-) create mode 100644 policyengine_us_data/utils/spm.py diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..c3d09844 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,8 @@ +- bump: minor + changes: + added: + - SPM threshold calculation using policyengine/spm-calculator package + - New utility module (policyengine_us_data/utils/spm.py) for SPM calculations + changed: + - CPS datasets now calculate SPM thresholds using spm-calculator with Census-provided geographic adjustments + - ACS datasets now calculate SPM thresholds using spm-calculator with national-level thresholds diff --git a/policyengine_us_data/datasets/acs/acs.py b/policyengine_us_data/datasets/acs/acs.py index 0ecd3ee7..836c2a1f 100644 --- a/policyengine_us_data/datasets/acs/acs.py +++ b/policyengine_us_data/datasets/acs/acs.py @@ -25,6 +25,7 @@ def generate(self) -> None: self.add_id_variables(acs, person, household) self.add_person_variables(acs, person, household) self.add_household_variables(acs, household) + self.add_spm_variables(acs, person, household, self.time_period) acs.close() raw_data.close() @@ -93,9 +94,53 @@ def add_person_variables( ) @staticmethod - def add_spm_variables(acs: h5py.File, spm_unit: DataFrame) -> None: - acs["spm_unit_net_income_reported"] = spm_unit.SPM_RESOURCES - acs["spm_unit_spm_threshold"] = spm_unit.SPM_POVTHRESHOLD + def add_spm_variables( + acs: h5py.File, + person: DataFrame, + household: DataFrame, + time_period: int, + ) -> None: + from policyengine_us_data.utils.spm import ( + calculate_spm_thresholds_national, + map_tenure_acs_to_spm, + ) + + # In ACS, SPM unit = household + # Calculate number of adults (18+) and children (<18) per household + person_with_hh = person.copy() + person_with_hh["is_adult"] = person_with_hh["AGEP"] >= 18 + person_with_hh["is_child"] = person_with_hh["AGEP"] < 18 + + hh_counts = ( + person_with_hh.groupby("household_id") + .agg({"is_adult": "sum", "is_child": "sum"}) + .rename( + columns={"is_adult": "num_adults", "is_child": "num_children"} + ) + ) + + # Ensure household is indexed properly + household_indexed = household.set_index("household_id") + + # Get counts aligned with household order + num_adults = hh_counts.loc[ + household_indexed.index, "num_adults" + ].values + num_children = hh_counts.loc[ + household_indexed.index, "num_children" + ].values + + # Map ACS tenure to SPM tenure codes + tenure_codes = map_tenure_acs_to_spm(household_indexed["TEN"].values) + + # Calculate SPM thresholds using national-level values + # (ACS doesn't have Census-provided geographic adjustments) + acs["spm_unit_spm_threshold"] = calculate_spm_thresholds_national( + num_adults=num_adults, + num_children=num_children, + tenure_codes=tenure_codes, + year=time_period, + ) @staticmethod def add_household_variables(acs: h5py.File, household: DataFrame) -> None: diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index f932e0d5..90fc72c3 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -78,7 +78,7 @@ def generate(self): undocumented_students_target=0.21 * 1.9e6, ) 
logging.info("Adding family variables") - add_spm_variables(cps, spm_unit) + add_spm_variables(cps, spm_unit, self.time_period) logging.info("Adding household variables") add_household_variables(cps, household) logging.info("Adding rent") @@ -602,7 +602,15 @@ def add_personal_income_variables( cps[f"{var}_would_be_qualified"] = rng.random(len(person)) < prob -def add_spm_variables(cps: h5py.File, spm_unit: DataFrame) -> None: +def add_spm_variables( + cps: h5py.File, + spm_unit: DataFrame, + time_period: int, +) -> None: + from policyengine_us_data.utils.spm import ( + calculate_spm_thresholds_with_geoadj, + ) + SPM_RENAMES = dict( spm_unit_total_income_reported="SPM_TOTVAL", snap_reported="SPM_SNAPSUB", @@ -616,7 +624,6 @@ def add_spm_variables(cps: h5py.File, spm_unit: DataFrame) -> None: # State tax includes refundable credits. spm_unit_state_tax_reported="SPM_STTAX", spm_unit_capped_work_childcare_expenses="SPM_CAPWKCCXPNS", - spm_unit_spm_threshold="SPM_POVTHRESHOLD", spm_unit_net_income_reported="SPM_RESOURCES", spm_unit_pre_subsidy_childcare_expenses="SPM_CHILDCAREXPNS", ) @@ -625,6 +632,16 @@ def add_spm_variables(cps: h5py.File, spm_unit: DataFrame) -> None: if asec_variable in spm_unit.columns: cps[openfisca_variable] = spm_unit[asec_variable] + # Calculate SPM thresholds using spm-calculator with Census-provided + # geographic adjustment factors (SPM_GEOADJ) + cps["spm_unit_spm_threshold"] = calculate_spm_thresholds_with_geoadj( + num_adults=spm_unit["SPM_NUMADULTS"].values, + num_children=spm_unit["SPM_NUMKIDS"].values, + tenure_codes=spm_unit["SPM_TENMORTSTATUS"].values, + geoadj=spm_unit["SPM_GEOADJ"].values, + year=time_period, + ) + cps["reduced_price_school_meals_reported"] = ( cps["free_school_meals_reported"] * 0 ) diff --git a/policyengine_us_data/utils/spm.py b/policyengine_us_data/utils/spm.py new file mode 100644 index 00000000..05c04234 --- /dev/null +++ b/policyengine_us_data/utils/spm.py @@ -0,0 +1,122 @@ +"""SPM threshold calculation utilities using the spm-calculator package.""" + +import numpy as np +from spm_calculator import SPMCalculator, spm_equivalence_scale + + +# Census CPS SPM_TENMORTSTATUS codes to spm-calculator tenure mapping +# Based on IPUMS SPMMORT documentation: +# 1 = Owner with mortgage +# 2 = Owner without mortgage +# 3 = Renter +TENURE_CODE_MAP = { + 1: "owner_with_mortgage", + 2: "owner_without_mortgage", + 3: "renter", +} + + +def calculate_spm_thresholds_with_geoadj( + num_adults: np.ndarray, + num_children: np.ndarray, + tenure_codes: np.ndarray, + geoadj: np.ndarray, + year: int, +) -> np.ndarray: + """ + Calculate SPM thresholds using Census-provided geographic adjustments. + + This function uses the SPM_GEOADJ values already computed by the Census + Bureau, combined with spm-calculator's base thresholds and equivalence + scale formula. This avoids the need for a Census API key. + + Args: + num_adults: Array of number of adults (18+) in each SPM unit. + num_children: Array of number of children (<18) in each SPM unit. + tenure_codes: Array of Census tenure/mortgage status codes. + 1 = owner with mortgage, 2 = owner without mortgage, 3 = renter. + geoadj: Array of Census SPM_GEOADJ geographic adjustment factors. + year: The year for which to calculate thresholds. + + Returns: + Array of SPM threshold values. 
+ """ + calc = SPMCalculator(year=year) + base_thresholds = calc.get_base_thresholds() + + n = len(num_adults) + thresholds = np.zeros(n) + + for i in range(n): + tenure_str = TENURE_CODE_MAP.get(int(tenure_codes[i]), "renter") + base = base_thresholds[tenure_str] + equiv_scale = spm_equivalence_scale( + int(num_adults[i]), int(num_children[i]) + ) + thresholds[i] = base * equiv_scale * geoadj[i] + + return thresholds + + +def calculate_spm_thresholds_national( + num_adults: np.ndarray, + num_children: np.ndarray, + tenure_codes: np.ndarray, + year: int, +) -> np.ndarray: + """ + Calculate SPM thresholds using national-level thresholds (no geoadj). + + This is used for datasets like ACS that don't have pre-computed + geographic adjustment factors. + + Args: + num_adults: Array of number of adults (18+) in each SPM unit. + num_children: Array of number of children (<18) in each SPM unit. + tenure_codes: Array of Census tenure/mortgage status codes. + 1 = owner with mortgage, 2 = owner without mortgage, 3 = renter. + year: The year for which to calculate thresholds. + + Returns: + Array of SPM threshold values using national averages. + """ + calc = SPMCalculator(year=year) + base_thresholds = calc.get_base_thresholds() + + n = len(num_adults) + thresholds = np.zeros(n) + + for i in range(n): + tenure_str = TENURE_CODE_MAP.get(int(tenure_codes[i]), "renter") + base = base_thresholds[tenure_str] + equiv_scale = spm_equivalence_scale( + int(num_adults[i]), int(num_children[i]) + ) + # No geographic adjustment for national-level thresholds + thresholds[i] = base * equiv_scale + + return thresholds + + +def map_tenure_acs_to_spm(tenure_type: np.ndarray) -> np.ndarray: + """ + Map ACS tenure type values to spm-calculator tenure codes. + + Args: + tenure_type: Array of ACS TEN values. + 1 = Owned with mortgage/loan + 2 = Owned free and clear + 3 = Rented + + Returns: + Array of tenure code integers matching Census SPM format. + """ + # ACS TEN codes map directly to Census SPM codes: + # ACS 1 (owned with mortgage) -> Census 1 (owner_with_mortgage) + # ACS 2 (owned outright) -> Census 2 (owner_without_mortgage) + # ACS 3 (rented) -> Census 3 (renter) + return np.where( + np.isin(tenure_type, [1, 2, 3]), + tenure_type, + 3, # Default to renter for unknown values + ) diff --git a/pyproject.toml b/pyproject.toml index 3d00d389..f80c7c7f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,7 @@ dependencies = [ "sqlalchemy>=2.0.41", "sqlmodel>=0.0.24", "xlrd>=2.0.2", + "spm-calculator @ git+https://github.com/PolicyEngine/spm-calculator.git", ] [project.optional-dependencies] From d8a5beeb28d991f2f413f5c1d17f9ccc27c9a0b8 Mon Sep 17 00:00:00 2001 From: David Trimmer Date: Fri, 5 Dec 2025 10:33:20 -0500 Subject: [PATCH 03/10] Clear notebook outputs to fix MyST documentation build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The long_term_projections.ipynb notebook outputs were causing a "useOutputsContext must be used within a OutputsContextProvider" error in MyST's book-theme during documentation build. Clearing the outputs resolves this issue. 
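For reference, one way to clear the outputs locally (assuming `jupyter nbconvert` is available) is:

```bash
# Illustrative command: strip all cell outputs in place before rebuilding the docs.
jupyter nbconvert --clear-output --inplace docs/long_term_projections.ipynb
```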
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- docs/long_term_projections.ipynb | 495 +------------------------------ 1 file changed, 16 insertions(+), 479 deletions(-) diff --git a/docs/long_term_projections.ipynb b/docs/long_term_projections.ipynb index 10b07b72..b99ab2ac 100644 --- a/docs/long_term_projections.ipynb +++ b/docs/long_term_projections.ipynb @@ -24,7 +24,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2025-11-19T19:56:10.232617Z", @@ -33,139 +33,7 @@ "shell.execute_reply": "2025-11-19T19:57:25.892382Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "TEST_LITE == False\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "======================================================================\r\n", - "HOUSEHOLD-LEVEL INCOME TAX PROJECTION: 2025-2027\r\n", - "======================================================================\r\n", - "\r\n", - "Configuration:\r\n", - " Base year: 2024 (CPS microdata)\r\n", - " Projection: 2025-2027\r\n", - " Calculation level: HOUSEHOLD ONLY (simplified)\r\n", - " Calibration method: GREG\r\n", - " Including Social Security benefits constraint: Yes\r\n", - " Including taxable payroll constraint: Yes\r\n", - " Saving year-specific .h5 files: Yes (to ./projected_datasets/)\r\n", - " Years to process: 3\r\n", - " Estimated time: ~9 minutes\r\n", - "\r\n", - "======================================================================\r\n", - "STEP 1: DEMOGRAPHIC PROJECTIONS\r\n", - "======================================================================\r\n", - "\r\n", - "Loaded SSA projections: 86 ages x 3 years\r\n", - "\r\n", - "Population projections:\r\n", - " 2025: 346.6M\r\n", - " 2027: 350.9M\r\n", - "\r\n", - "======================================================================\r\n", - "STEP 2: BUILDING HOUSEHOLD AGE COMPOSITION\r\n", - "======================================================================\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\r\n", - "Loaded 21,532 households\r\n", - "Household age matrix shape: (21532, 86)\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\r\n", - "======================================================================\r\n", - "STEP 3: HOUSEHOLD-LEVEL PROJECTION\r\n", - "======================================================================\r\n", - "\r\n", - "Methodology (SIMPLIFIED):\r\n", - " 1. PolicyEngine uprates to each projection year\r\n", - " 2. Calculate all values at household level (map_to='household')\r\n", - " 3. IPF/GREG adjusts weights to match SSA demographics\r\n", - " 4. 
Apply calibrated weights directly (no aggregation needed)\r\n", - "\r\n", - "Initial memory usage: 1.13 GB\r\n", - "\r\n", - "Year Population Income Tax Baseline Tax Memory\r\n", - "-----------------------------------------------------------------\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " [DEBUG 2025] SS baseline: $1424.6B, target: $1609.0B\r\n", - " [DEBUG 2025] Payroll baseline: $8950.9B, target: $10621.0B\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " [DEBUG 2025] SS achieved: $1609.0B (error: -0.0%)\r\n", - " [DEBUG 2025] Payroll achieved: $10621.0B (error: -0.0%)\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Saved 2025.h5\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2025 346.6M $ 2543.1B $ 1882.9B 4.32GB\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " [DEBUG 2027] SS baseline: $1495.1B, target: $1799.9B\r\n", - " [DEBUG 2027] Payroll baseline: $9718.4B, target: $11627.0B\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " [DEBUG 2027] SS achieved: $1799.9B (error: -0.0%)\r\n", - " [DEBUG 2027] Payroll achieved: $11627.0B (error: 0.0%)\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Saved 2027.h5\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2027 350.9M $ 2873.8B $ 2125.0B 4.62GB\r\n" - ] - } - ], + "outputs": [], "source": [ "# Save calibrated datasets as .h5 files for each year\n", "!python ../policyengine_us_data/datasets/cps/long_term/run_household_projection.py 2027 --greg --use-ss --use-payroll --save-h5" @@ -227,7 +95,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2025-11-19T19:57:25.894248Z", @@ -236,23 +104,7 @@ "shell.execute_reply": "2025-11-19T19:57:28.259297Z" } }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/baogorek/envs/pe/lib/python3.13/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "TEST_LITE == False\n" - ] - } - ], + "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", @@ -264,7 +116,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2025-11-19T19:57:28.260599Z", @@ -273,149 +125,7 @@ "shell.execute_reply": "2025-11-19T19:57:28.275040Z" } }, - "outputs": [ - { - "data": { - "text/html": [ - "
[text/html table output of ssa_pop.head(); the same rows appear in the text/plain output below]
" - ], - "text/plain": [ - " Year Age Total M Tot M Sin M Mar M Wid M Div F Tot \\\n", - "0 1941 0 2492508 1276328 1276328 0 0 0 1216180 \n", - "1 1941 1 2384290 1218524 1218524 0 0 0 1165766 \n", - "2 1941 2 2445054 1246078 1246078 0 0 0 1198976 \n", - "3 1941 3 2395999 1219543 1219543 0 0 0 1176456 \n", - "4 1941 4 2275425 1157612 1157612 0 0 0 1117813 \n", - "\n", - " F Sin F Mar F Wid F Div \n", - "0 1216180 0 0 0 \n", - "1 1165766 0 0 0 \n", - "2 1198976 0 0 0 \n", - "3 1176456 0 0 0 \n", - "4 1117813 0 0 0 " - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Load SSA population data\n", "ssa_pop = pd.read_csv(STORAGE_FOLDER / 'SSPopJul_TR2024.csv')\n", @@ -424,7 +134,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2025-11-19T19:57:28.276266Z", @@ -433,101 +143,7 @@ "shell.execute_reply": "2025-11-19T19:57:28.279840Z" } }, - "outputs": [ - { - "data": { - "text/html": [ - "
[text/html table output of ss_aux.head(); the same rows appear in the text/plain output below]
" - ], - "text/plain": [ - " year oasdi_cost_in_billion_2025_usd cpi_w_intermediate \\\n", - "0 2025 1609 100.00 \n", - "1 2026 1660 102.49 \n", - "2 2027 1715 104.95 \n", - "3 2028 1763 107.47 \n", - "4 2029 1810 110.05 \n", - "\n", - " oasdi_cost_in_billion_nominal_usd taxable_payroll_in_billion_nominal_usd \n", - "0 1609.0000 10621.0 \n", - "1 1701.3340 11129.0 \n", - "2 1799.8925 11627.0 \n", - "3 1894.6961 12159.0 \n", - "4 1991.9050 12696.0 " - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Load Social Security auxiliary data\n", "ss_aux = pd.read_csv(STORAGE_FOLDER / 'social_security_aux.csv')\n", @@ -590,7 +206,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2025-11-19T19:57:28.280998Z", @@ -609,7 +225,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2025-11-19T19:57:28.283285Z", @@ -618,18 +234,7 @@ "shell.execute_reply": "2025-11-19T19:57:28.296600Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Age distribution targets for 2025:\n", - "Shape: (86, 1)\n", - "Total population: 346577.3M\n" - ] - } - ], + "outputs": [], "source": [ "# Get SSA population targets for a specific year\n", "year = 2025\n", @@ -641,7 +246,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2025-11-19T19:57:28.297698Z", @@ -650,16 +255,7 @@ "shell.execute_reply": "2025-11-19T19:57:28.299938Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Social Security benefit target for 2025: $1609.0B\n" - ] - } - ], + "outputs": [], "source": [ "# Get Social Security benefit target\n", "ss_target = load_ssa_benefit_projections(year)\n", @@ -707,7 +303,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2025-11-19T19:57:28.300920Z", @@ -716,66 +312,7 @@ "shell.execute_reply": "2025-11-19T19:59:04.747122Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "================================================================================\n", - "WHARTON COMPARISON PIPELINE - YEAR 2054\n", - "================================================================================\n", - "\n", - "Running PolicyEngine analysis...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading dataset: hf://policyengine/test/2054.h5\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Ranking according to quantiles with: household_net_income\n", - "✓ Analysis complete\n", - " Revenue impact: $-579.1B\n", - "\n", - "Generating comparison table...\n", - "\n", - "================================================================================\n", - "COMPARISON TABLE: 2054\n", - "================================================================================\n", - "\n", - "Average Tax Change (per household):\n", - " Income Group PolicyEngine Wharton Difference\n", - " First quintile $-95 $-5 $-90\n", - "Second quintile $-1,054 $-275 $-779\n", - "Middle quintile $-2,241 $-1,730 $-511\n", - "Fourth quintile $-4,633 $-3,560 $-1,073\n", - " 80-90% $-6,737 $-4,075 $-2,662\n", - " 90-95% $-12,121 $-4,385 $-7,736\n", - " 95-99% $-8,066 $-4,565 $-3,501\n", - " 
99-99.9% $-7,257 $-4,820 $-2,437\n", - " Top 0.1% $-8,615 $-5,080 $-3,535\n", - "\n", - "Percent Change in Income:\n", - " Income Group PE % Wharton %\n", - " First quintile -0.10000000149011612% 0.0%\n", - "Second quintile 0.800000011920929% 0.3%\n", - "Middle quintile 1.100000023841858% 1.3%\n", - "Fourth quintile 1.399999976158142% 1.6%\n", - " 80-90% 1.399999976158142% 1.2%\n", - " 90-95% 1.7000000476837158% 0.9%\n", - " 95-99% 0.699999988079071% 0.6%\n", - " 99-99.9% 0.30000001192092896% 0.2%\n", - " Top 0.1% 0.10000000149011612% 0.0%\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "import sys\n", "import os\n", @@ -969,7 +506,7 @@ "# Run analysis\n", "print(\"Running PolicyEngine analysis...\")\n", "pe_results, revenue_impact = run_analysis(dataset_path, year, income_rank_variable)\n", - "print(f\"✓ Analysis complete\")\n", + "print(f\"\u2713 Analysis complete\")\n", "print(f\" Revenue impact: ${revenue_impact:.1f}B\")\n", "print()\n", "\n", From 74286690d914df83251f631297348eb847dbe071 Mon Sep 17 00:00:00 2001 From: David Trimmer Date: Fri, 5 Dec 2025 11:08:52 -0500 Subject: [PATCH 04/10] Convert long_term_projections.ipynb to markdown to fix MyST build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Jupyter notebook was causing "useOutputsContext must be used within a OutputsContextProvider" errors in the MyST book-theme. Converting to markdown preserves all content while avoiding the rendering bug. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- docs/long_term_projections.ipynb | 552 ------------------------------- docs/long_term_projections.md | 348 +++++++++++++++++++ docs/myst.yml | 2 +- 3 files changed, 349 insertions(+), 553 deletions(-) delete mode 100644 docs/long_term_projections.ipynb create mode 100644 docs/long_term_projections.md diff --git a/docs/long_term_projections.ipynb b/docs/long_term_projections.ipynb deleted file mode 100644 index b99ab2ac..00000000 --- a/docs/long_term_projections.ipynb +++ /dev/null @@ -1,552 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": "# Long Term Projections\n## Integrating Economic Uprating with Demographic Reweighting" - }, - { - "cell_type": "markdown", - "source": "## Executive Summary\n\nThis document outlines an innovative approach for projecting federal income tax revenue through 2100 that uniquely combines sophisticated economic microsimulation with demographic reweighting. 
By harmonizing PolicyEngine's state-of-the-art tax modeling with Social Security Administration demographic projections, we can isolate and quantify the fiscal impact of population aging while preserving the full complexity of the tax code.", - "metadata": {} - }, - { - "cell_type": "markdown", - "source": "## The Challenge\n\nProjecting tax revenue over a 75-year horizon requires simultaneously modeling two distinct but interrelated dynamics:\n\n**Economic Evolution**: How incomes, prices, and tax parameters change over time\n- Wage growth and income distribution shifts\n- Inflation affecting brackets and deductions\n- Legislative changes and indexing rules\n- Behavioral responses to tax policy\n\n**Demographic Transformation**: How the population structure evolves\n- Baby boom generation aging through retirement\n- Declining birth rates reducing working-age population\n- Increasing longevity extending retirement duration\n- Shifting household composition patterns\n\nTraditional approaches typically sacrifice either economic sophistication (using simplified tax calculations) or demographic realism (holding age distributions constant). Our methodology preserves both.", - "metadata": {} - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Run projections using `run_household_projection.py`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2025-11-19T19:56:10.232617Z", - "iopub.status.busy": "2025-11-19T19:56:10.232523Z", - "iopub.status.idle": "2025-11-19T19:57:25.892674Z", - "shell.execute_reply": "2025-11-19T19:57:25.892382Z" - } - }, - "outputs": [], - "source": [ - "# Save calibrated datasets as .h5 files for each year\n", - "!python ../policyengine_us_data/datasets/cps/long_term/run_household_projection.py 2027 --greg --use-ss --use-payroll --save-h5" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Arguments:**\n", - "- `END_YEAR`: Target year for projection (default: 2035)\n", - "- `--greg`: Use GREG calibration instead of IPF (optional)\n", - "- `--use-ss`: Include Social Security benefit totals as calibration target (requires --greg)\n", - "- `--use-payroll`: Include taxable payroll as calibration target (requires --greg)\n", - "- `--save-h5`: Save year-specific .h5 files to `./projected_datasets/` directory" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## The Challenge\n", - "\n", - "Projecting tax revenue over a 75-year horizon requires simultaneously modeling two distinct but interrelated dynamics:\n", - "\n", - "**Economic Evolution**: How incomes, prices, and tax parameters change over time\n", - "- Wage growth and income distribution shifts\n", - "- Inflation affecting brackets and deductions\n", - "- Legislative changes and indexing rules\n", - "- Behavioral responses to tax policy\n", - "\n", - "**Demographic Transformation**: How the population structure evolves\n", - "- Baby boom generation aging through retirement\n", - "- Declining birth rates reducing working-age population\n", - "- Increasing longevity extending retirement duration\n", - "- Shifting household composition patterns\n", - "\n", - "Traditional approaches typically sacrifice either economic sophistication (using simplified tax calculations) or demographic realism (holding age distributions constant). Our methodology preserves both." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Sources\n", - "\n", - "The long-term projections use two key SSA datasets:\n", - "\n", - "1. **SSA Population Projections** (`SSPopJul_TR2024.csv`)\n", - " - Source: [SSA 2024 Trustees Report - Single Year Age Demographic Projections](https://www.ssa.gov/oact/HistEst/Population/2024/Population2024.html)\n", - " - Contains age-specific population projections through 2100\n", - " - Used for demographic reweighting to match future population structure\n", - "\n", - "2. **Social Security Cost Projections** (`social_security_aux.csv`)\n", - " - Source: [SSA 2025 Trustees Report, Table VI.G9](https://www.ssa.gov/oact/TR/2025/index.html)\n", - " - Contains OASDI benefit cost projections in CPI-indexed 2025 dollars\n", - " - Used as calibration target in GREG method to ensure fiscal consistency" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2025-11-19T19:57:25.894248Z", - "iopub.status.busy": "2025-11-19T19:57:25.894165Z", - "iopub.status.idle": "2025-11-19T19:57:28.259561Z", - "shell.execute_reply": "2025-11-19T19:57:28.259297Z" - } - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "from pathlib import Path\n", - "\n", - "from policyengine_us_data.storage import STORAGE_FOLDER" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2025-11-19T19:57:28.260599Z", - "iopub.status.busy": "2025-11-19T19:57:28.260502Z", - "iopub.status.idle": "2025-11-19T19:57:28.275248Z", - "shell.execute_reply": "2025-11-19T19:57:28.275040Z" - } - }, - "outputs": [], - "source": [ - "# Load SSA population data\n", - "ssa_pop = pd.read_csv(STORAGE_FOLDER / 'SSPopJul_TR2024.csv')\n", - "ssa_pop.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2025-11-19T19:57:28.276266Z", - "iopub.status.busy": "2025-11-19T19:57:28.276171Z", - "iopub.status.idle": "2025-11-19T19:57:28.280120Z", - "shell.execute_reply": "2025-11-19T19:57:28.279840Z" - } - }, - "outputs": [], - "source": [ - "# Load Social Security auxiliary data\n", - "ss_aux = pd.read_csv(STORAGE_FOLDER / 'social_security_aux.csv')\n", - "ss_aux.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Core Innovation\n", - "\n", - "Our approach operates in two complementary stages:\n", - "\n", - "### Stage 1: Economic Uprating\n", - "\n", - "PolicyEngine's microsimulation engine projects each household's economic circumstances forward using:\n", - "\n", - "**Sophisticated Income Modeling**\n", - "\n", - "The system models 17 distinct income categories, each uprated according to its economic fundamentals:\n", - "\n", - "*Primary Categories with Specific Projections:*\n", - "- Employment income (wages) - follows CBO wage growth projections\n", - "- Self-employment income - follows CBO business income projections\n", - "- Capital gains - follows CBO asset appreciation projections\n", - "- Interest income - follows CBO interest rate projections\n", - "- Dividend income - follows CBO corporate profit projections\n", - "- Pension income - follows CBO retirement income projections\n", - "- Social Security - follows SSA COLA projections (available through 2100)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Stage 2: Demographic 
Reweighting\n", - "\n", - "We offer two calibration methods for adjusting household weights to match SSA projections:\n", - "\n", - "**Method 1: Iterative Proportional Fitting (IPF)**\n", - "- Traditional raking approach using Kullback-Leibler divergence\n", - "- Iteratively adjusts weights to match marginal distributions\n", - "- Robust to specification and always produces non-negative weights\n", - "- Default method for backward compatibility\n", - "\n", - "**Method 2: Generalized Regression (GREG) Calibration**\n", - "- Modern calibration using chi-squared distance minimization\n", - "- Enables simultaneous calibration to categorical AND continuous variables\n", - "- Direct solution via matrix operations (no iteration needed)\n", - "- Required for incorporating Social Security benefit constraints" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Demonstrating the Calibration Methods" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2025-11-19T19:57:28.280998Z", - "iopub.status.busy": "2025-11-19T19:57:28.280918Z", - "iopub.status.idle": "2025-11-19T19:57:28.282636Z", - "shell.execute_reply": "2025-11-19T19:57:28.282472Z" - } - }, - "outputs": [], - "source": [ - "from policyengine_us_data.datasets.cps.long_term.ssa_data import (\n", - " load_ssa_age_projections,\n", - " load_ssa_benefit_projections\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2025-11-19T19:57:28.283285Z", - "iopub.status.busy": "2025-11-19T19:57:28.283217Z", - "iopub.status.idle": "2025-11-19T19:57:28.296829Z", - "shell.execute_reply": "2025-11-19T19:57:28.296600Z" - } - }, - "outputs": [], - "source": [ - "# Get SSA population targets for a specific year\n", - "year = 2025\n", - "age_targets = load_ssa_age_projections(end_year=year)\n", - "print(f\"\\nAge distribution targets for {year}:\")\n", - "print(f\"Shape: {age_targets.shape}\")\n", - "print(f\"Total population: {age_targets[:, 0].sum() / 1000:.1f}M\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2025-11-19T19:57:28.297698Z", - "iopub.status.busy": "2025-11-19T19:57:28.297611Z", - "iopub.status.idle": "2025-11-19T19:57:28.300109Z", - "shell.execute_reply": "2025-11-19T19:57:28.299938Z" - } - }, - "outputs": [], - "source": [ - "# Get Social Security benefit target\n", - "ss_target = load_ssa_benefit_projections(year)\n", - "print(f\"\\nSocial Security benefit target for {year}: ${ss_target / 1e9:.1f}B\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### PWBM Analysis: Eliminating Income Taxes on Social Security Benefits\n", - "\n", - "**Source:** [Eliminating Income Taxes on Social Security Benefits](https://budgetmodel.wharton.upenn.edu/issues/2025/2/10/eliminating-income-taxes-on-social-security-benefits) (Penn Wharton Budget Model, February 10, 2025)\n", - "\n", - "---\n", - "\n", - "#### Policy Analyzed\n", - "The Penn Wharton Budget Model (PWBM) analyzed a policy proposal to permanently eliminate all income taxes on Social Security benefits, effective January 1, 2025.\n", - "\n", - "#### Key Findings\n", - "\n", - "* **Budgetary Impact:** The policy is projected to reduce federal revenues by **$1.45 trillion** over the 10-year budget window (2025-2034). 
Over the long term, it is projected to increase federal debt by 7 percent by 2054, relative to the current baseline.\n", - "\n", - "* **Macroeconomic Impact:** The analysis finds the policy would have negative long-term effects on the economy.\n", - " * It reduces incentives for households to save for retirement and to work.\n", - " * This leads to a smaller capital stock (projected to be 4.2% lower by 2054).\n", - " * The smaller capital stock results in lower average wages (1.8% lower by 2054) and lower GDP (2.1% lower by 2054).\n", - "\n", - "* **Conventional Distributional Impact (Your Table):** The table you shared shows the annual \"conventional\" effects on household after-tax income.\n", - " * The largest average *dollar* tax cuts go to households in the top 20 percent of the income distribution (quintiles 80-100%).\n", - " * The largest *relative* gains (as a percentage of income) go to households in the fourth quintile (60-80%), who see a 1.6% increase in after-tax income by 2054.\n", - " * The dollar amounts shown are in **nominal dollars** for each specified year, not adjusted to a single base year.\n", - "\n", - "* **Dynamic (Lifetime) Impact:** When analyzing the policy's effects over a household's entire lifetime, PWBM finds:\n", - " * The policy primarily benefits high-income households who are nearing or in retirement.\n", - " * It negatively impacts all households under the age of 30 and all future generations, who would experience a net welfare loss due to the long-term effects of lower wages and higher federal debt." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### PolicyEngine's Analysis of Eliminating Income Taxes on Social Security Benefits" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2025-11-19T19:57:28.300920Z", - "iopub.status.busy": "2025-11-19T19:57:28.300844Z", - "iopub.status.idle": "2025-11-19T19:59:04.747344Z", - "shell.execute_reply": "2025-11-19T19:59:04.747122Z" - } - }, - "outputs": [], - "source": [ - "import sys\n", - "import os\n", - "import pandas as pd\n", - "import numpy as np\n", - "import gc\n", - "\n", - "from policyengine_us import Microsimulation\n", - "from policyengine_core.reforms import Reform\n", - "\n", - "WHARTON_BENCHMARKS = {\n", - " 2026: {\n", - " 'First quintile': {'tax_change': 0, 'pct_change': 0.0},\n", - " 'Second quintile': {'tax_change': -15, 'pct_change': 0.0},\n", - " 'Middle quintile': {'tax_change': -340, 'pct_change': 0.5},\n", - " 'Fourth quintile': {'tax_change': -1135, 'pct_change': 1.1},\n", - " '80-90%': {'tax_change': -1625, 'pct_change': 1.0},\n", - " '90-95%': {'tax_change': -1590, 'pct_change': 0.7},\n", - " '95-99%': {'tax_change': -2020, 'pct_change': 0.5},\n", - " '99-99.9%': {'tax_change': -2205, 'pct_change': 0.2},\n", - " 'Top 0.1%': {'tax_change': -2450, 'pct_change': 0.0},\n", - " },\n", - " 2034: {\n", - " 'First quintile': {'tax_change': 0, 'pct_change': 0.0},\n", - " 'Second quintile': {'tax_change': -45, 'pct_change': 0.1},\n", - " 'Middle quintile': {'tax_change': -615, 'pct_change': 0.8},\n", - " 'Fourth quintile': {'tax_change': -1630, 'pct_change': 1.2},\n", - " '80-90%': {'tax_change': -2160, 'pct_change': 1.1},\n", - " '90-95%': {'tax_change': -2160, 'pct_change': 0.7},\n", - " '95-99%': {'tax_change': -2605, 'pct_change': 0.6},\n", - " '99-99.9%': {'tax_change': -2715, 'pct_change': 0.2},\n", - " 'Top 0.1%': {'tax_change': -2970, 'pct_change': 0.0},\n", - " },\n", - " 2054: {\n", - " 
'First quintile': {'tax_change': -5, 'pct_change': 0.0},\n", - " 'Second quintile': {'tax_change': -275, 'pct_change': 0.3},\n", - " 'Middle quintile': {'tax_change': -1730, 'pct_change': 1.3},\n", - " 'Fourth quintile': {'tax_change': -3560, 'pct_change': 1.6},\n", - " '80-90%': {'tax_change': -4075, 'pct_change': 1.2},\n", - " '90-95%': {'tax_change': -4385, 'pct_change': 0.9},\n", - " '95-99%': {'tax_change': -4565, 'pct_change': 0.6},\n", - " '99-99.9%': {'tax_change': -4820, 'pct_change': 0.2},\n", - " 'Top 0.1%': {'tax_change': -5080, 'pct_change': 0.0},\n", - " },\n", - "}\n", - "\n", - "def run_analysis(dataset_path, year, income_rank_var = \"household_net_income\"):\n", - " \"\"\"Run Option 1 analysis for given dataset and year\"\"\"\n", - "\n", - " option1_reform = Reform.from_dict(\n", - " {\n", - " # Base rate parameters (0-50% bracket)\n", - " \"gov.irs.social_security.taxability.rate.base.benefit_cap\": {\n", - " \"2026-01-01.2100-12-31\": 0\n", - " },\n", - " \"gov.irs.social_security.taxability.rate.base.excess\": {\n", - " \"2026-01-01.2100-12-31\": 0\n", - " },\n", - " # Additional rate parameters (50-85% bracket)\n", - " \"gov.irs.social_security.taxability.rate.additional.benefit_cap\": {\n", - " \"2026-01-01.2100-12-31\": 0\n", - " },\n", - " \"gov.irs.social_security.taxability.rate.additional.bracket\": {\n", - " \"2026-01-01.2100-12-31\": 0\n", - " },\n", - " \"gov.irs.social_security.taxability.rate.additional.excess\": {\n", - " \"2026-01-01.2100-12-31\": 0\n", - " }\n", - " }, country_id=\"us\"\n", - " )\n", - " reform = Microsimulation(dataset=dataset_path, reform=option1_reform)\n", - "\n", - " # Get household data\n", - " household_net_income_reform = reform.calculate(\"household_net_income\", period=year, map_to=\"household\")\n", - " household_agi_reform = reform.calculate(\"adjusted_gross_income\", period=year, map_to=\"household\")\n", - " income_tax_reform = reform.calculate(\"income_tax\", period=year, map_to=\"household\")\n", - "\n", - " del reform\n", - " gc.collect()\n", - "\n", - " print(f\"Loading dataset: {dataset_path}\")\n", - " baseline = Microsimulation(dataset=dataset_path)\n", - " household_weight = baseline.calculate(\"household_weight\", period=year)\n", - " household_net_income_baseline = baseline.calculate(\"household_net_income\", period=year, map_to=\"household\")\n", - " household_agi_baseline = baseline.calculate(\"adjusted_gross_income\", period=year, map_to=\"household\")\n", - " income_tax_baseline = baseline.calculate(\"income_tax\", period=year, map_to=\"household\")\n", - "\n", - " # Calculate changes\n", - " tax_change = income_tax_reform - income_tax_baseline\n", - " income_change_pct = (\n", - " (household_net_income_reform - household_net_income_baseline) / household_net_income_baseline\n", - " ) * 100\n", - "\n", - " # Create DataFrame\n", - " df = pd.DataFrame({\n", - " 'household_net_income': household_net_income_baseline,\n", - " 'weight': household_weight,\n", - " 'tax_change': tax_change,\n", - " 'income_change_pct': income_change_pct,\n", - " 'income_rank_var': baseline.calculate(income_rank_var, year, map_to=\"household\")\n", - " })\n", - " \n", - " # Calculate percentiles\n", - "\n", - " print(f\"Ranking according to quantiles with: {income_rank_var}\")\n", - " df['income_percentile'] = df['income_rank_var'].rank(pct=True) * 100\n", - "\n", - " # Assign income groups\n", - " def assign_income_group(percentile):\n", - " if percentile <= 20:\n", - " return 'First quintile'\n", - " elif percentile <= 40:\n", - " 
return 'Second quintile'\n", - " elif percentile <= 60:\n", - " return 'Middle quintile'\n", - " elif percentile <= 80:\n", - " return 'Fourth quintile'\n", - " elif percentile <= 90:\n", - " return '80-90%'\n", - " elif percentile <= 95:\n", - " return '90-95%'\n", - " elif percentile <= 99:\n", - " return '95-99%'\n", - " elif percentile <= 99.9:\n", - " return '99-99.9%'\n", - " else:\n", - " return 'Top 0.1%'\n", - "\n", - " df['income_group'] = df['income_percentile'].apply(assign_income_group)\n", - "\n", - " # Calculate aggregate revenue\n", - " revenue_impact = (income_tax_reform.sum() - income_tax_baseline.sum()) / 1e9\n", - "\n", - " # Calculate by group\n", - " results = []\n", - " for group in ['First quintile', 'Second quintile', 'Middle quintile', 'Fourth quintile',\n", - " '80-90%', '90-95%', '95-99%', '99-99.9%', 'Top 0.1%']:\n", - " group_data = df[df['income_group'] == group]\n", - " if len(group_data) == 0:\n", - " continue\n", - "\n", - " total_weight = group_data['weight'].sum()\n", - " avg_tax_change = (group_data['tax_change'] * group_data['weight']).sum() / total_weight\n", - " avg_income_change_pct = (group_data['income_change_pct'] * group_data['weight']).sum() / total_weight\n", - "\n", - " results.append({\n", - " 'group': group,\n", - " 'pe_tax_change': round(avg_tax_change),\n", - " 'pe_pct_change': round(avg_income_change_pct, 1),\n", - " })\n", - "\n", - " return pd.DataFrame(results), revenue_impact\n", - "\n", - "def generate_comparison_table(pe_results, year):\n", - " \"\"\"Generate comparison table with Wharton benchmark\"\"\"\n", - "\n", - " if year not in WHARTON_BENCHMARKS:\n", - " print(f\"Warning: No Wharton benchmark available for year {year}\")\n", - " return pe_results\n", - "\n", - " wharton_data = WHARTON_BENCHMARKS[year]\n", - "\n", - " comparison = []\n", - " for _, row in pe_results.iterrows():\n", - " group = row['group']\n", - " wharton = wharton_data.get(group, {'tax_change': None, 'pct_change': None})\n", - "\n", - " pe_tax = row['pe_tax_change']\n", - " wh_tax = wharton['tax_change']\n", - "\n", - " comparison.append({\n", - " 'Income Group': group,\n", - " 'PolicyEngine': f\"${pe_tax:,}\",\n", - " 'Wharton': f\"${wh_tax:,}\" if wh_tax is not None else 'N/A',\n", - " 'Difference': f\"${(pe_tax - wh_tax):,}\" if wh_tax is not None else 'N/A',\n", - " 'PE %': f\"{row['pe_pct_change']}%\",\n", - " 'Wharton %': f\"{wharton['pct_change']}%\" if wharton['pct_change'] is not None else 'N/A',\n", - " })\n", - "\n", - " return pd.DataFrame(comparison)\n", - "\n", - "dataset_path = 'hf://policyengine/test/2054.h5'\n", - "year = 2054\n", - "income_rank_variable = \"household_net_income\"\n", - "\n", - "print(\"=\"*80)\n", - "print(f\"WHARTON COMPARISON PIPELINE - YEAR {year}\")\n", - "print(\"=\"*80)\n", - "print()\n", - "\n", - "# Run analysis\n", - "print(\"Running PolicyEngine analysis...\")\n", - "pe_results, revenue_impact = run_analysis(dataset_path, year, income_rank_variable)\n", - "print(f\"\u2713 Analysis complete\")\n", - "print(f\" Revenue impact: ${revenue_impact:.1f}B\")\n", - "print()\n", - "\n", - "# Generate comparison table\n", - "print(\"Generating comparison table...\")\n", - "comparison_table = generate_comparison_table(pe_results, year)\n", - "\n", - "print()\n", - "print(\"=\"*80)\n", - "print(f\"COMPARISON TABLE: {year}\")\n", - "print(\"=\"*80)\n", - "print()\n", - "print(\"Average Tax Change (per household):\")\n", - "print(comparison_table[['Income Group', 'PolicyEngine', 'Wharton', 
'Difference']].to_string(index=False))\n", - "print()\n", - "print(\"Percent Change in Income:\")\n", - "print(comparison_table[['Income Group', 'PE %', 'Wharton %']].to_string(index=False))\n", - "print()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/docs/long_term_projections.md b/docs/long_term_projections.md new file mode 100644 index 00000000..cc5d14c0 --- /dev/null +++ b/docs/long_term_projections.md @@ -0,0 +1,348 @@ +# Long Term Projections +## Integrating Economic Uprating with Demographic Reweighting + +## Executive Summary + +This document outlines an innovative approach for projecting federal income tax revenue through 2100 that uniquely combines sophisticated economic microsimulation with demographic reweighting. By harmonizing PolicyEngine's state-of-the-art tax modeling with Social Security Administration demographic projections, we can isolate and quantify the fiscal impact of population aging while preserving the full complexity of the tax code. + +## The Challenge + +Projecting tax revenue over a 75-year horizon requires simultaneously modeling two distinct but interrelated dynamics: + +**Economic Evolution**: How incomes, prices, and tax parameters change over time +- Wage growth and income distribution shifts +- Inflation affecting brackets and deductions +- Legislative changes and indexing rules +- Behavioral responses to tax policy + +**Demographic Transformation**: How the population structure evolves +- Baby boom generation aging through retirement +- Declining birth rates reducing working-age population +- Increasing longevity extending retirement duration +- Shifting household composition patterns + +Traditional approaches typically sacrifice either economic sophistication (using simplified tax calculations) or demographic realism (holding age distributions constant). Our methodology preserves both. + +## Running Projections + +Run projections using `run_household_projection.py`: + +```bash +# Save calibrated datasets as .h5 files for each year +python ../policyengine_us_data/datasets/cps/long_term/run_household_projection.py 2027 --greg --use-ss --use-payroll --save-h5 +``` + +**Arguments:** +- `END_YEAR`: Target year for projection (default: 2035) +- `--greg`: Use GREG calibration instead of IPF (optional) +- `--use-ss`: Include Social Security benefit totals as calibration target (requires --greg) +- `--use-payroll`: Include taxable payroll as calibration target (requires --greg) +- `--save-h5`: Save year-specific .h5 files to `./projected_datasets/` directory + +## Data Sources + +The long-term projections use two key SSA datasets: + +1. **SSA Population Projections** (`SSPopJul_TR2024.csv`) + - Source: [SSA 2024 Trustees Report - Single Year Age Demographic Projections](https://www.ssa.gov/oact/HistEst/Population/2024/Population2024.html) + - Contains age-specific population projections through 2100 + - Used for demographic reweighting to match future population structure + +2. 
**Social Security Cost Projections** (`social_security_aux.csv`) + - Source: [SSA 2025 Trustees Report, Table VI.G9](https://www.ssa.gov/oact/TR/2025/index.html) + - Contains OASDI benefit cost projections in CPI-indexed 2025 dollars + - Used as calibration target in GREG method to ensure fiscal consistency + +### Loading SSA Data + +```python +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +from pathlib import Path + +from policyengine_us_data.storage import STORAGE_FOLDER + +# Load SSA population data +ssa_pop = pd.read_csv(STORAGE_FOLDER / 'SSPopJul_TR2024.csv') +ssa_pop.head() + +# Load Social Security auxiliary data +ss_aux = pd.read_csv(STORAGE_FOLDER / 'social_security_aux.csv') +ss_aux.head() +``` + +## Core Innovation + +Our approach operates in two complementary stages: + +### Stage 1: Economic Uprating + +PolicyEngine's microsimulation engine projects each household's economic circumstances forward using: + +**Sophisticated Income Modeling** + +The system models 17 distinct income categories, each uprated according to its economic fundamentals: + +*Primary Categories with Specific Projections:* +- Employment income (wages) - follows CBO wage growth projections +- Self-employment income - follows CBO business income projections +- Capital gains - follows CBO asset appreciation projections +- Interest income - follows CBO interest rate projections +- Dividend income - follows CBO corporate profit projections +- Pension income - follows CBO retirement income projections +- Social Security - follows SSA COLA projections (available through 2100) + +### Stage 2: Demographic Reweighting + +We offer two calibration methods for adjusting household weights to match SSA projections: + +**Method 1: Iterative Proportional Fitting (IPF)** +- Traditional raking approach using Kullback-Leibler divergence +- Iteratively adjusts weights to match marginal distributions +- Robust to specification and always produces non-negative weights +- Default method for backward compatibility + +**Method 2: Generalized Regression (GREG) Calibration** +- Modern calibration using chi-squared distance minimization +- Enables simultaneous calibration to categorical AND continuous variables +- Direct solution via matrix operations (no iteration needed) +- Required for incorporating Social Security benefit constraints + +## Demonstrating the Calibration Methods + +```python +from policyengine_us_data.datasets.cps.long_term.ssa_data import ( + load_ssa_age_projections, + load_ssa_benefit_projections +) + +# Get SSA population targets for a specific year +year = 2025 +age_targets = load_ssa_age_projections(end_year=year) +print(f"\nAge distribution targets for {year}:") +print(f"Shape: {age_targets.shape}") +print(f"Total population: {age_targets[:, 0].sum() / 1000:.1f}M") + +# Get Social Security benefit target +ss_target = load_ssa_benefit_projections(year) +print(f"\nSocial Security benefit target for {year}: ${ss_target / 1e9:.1f}B") +``` + +## PWBM Analysis: Eliminating Income Taxes on Social Security Benefits + +**Source:** [Eliminating Income Taxes on Social Security Benefits](https://budgetmodel.wharton.upenn.edu/issues/2025/2/10/eliminating-income-taxes-on-social-security-benefits) (Penn Wharton Budget Model, February 10, 2025) + +--- + +### Policy Analyzed +The Penn Wharton Budget Model (PWBM) analyzed a policy proposal to permanently eliminate all income taxes on Social Security benefits, effective January 1, 2025. 
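+
+To make the mechanics concrete, the sketch below illustrates the simplified two-tier rule that currently determines how much of a household's Social Security benefit is included in taxable income. It is an illustration added for this document, not PWBM's or PolicyEngine's code, and the dollar thresholds shown are the current-law joint-filer base amounts. The reform specification in the next section zeroes the corresponding base (50%) and additional (85%) rate parameters.
+
+```python
+# Illustrative sketch only: a simplified version of the two-tier inclusion
+# rule (IRC Section 86) that the proposal would repeal. The $32,000/$44,000
+# base amounts are the current-law joint-filer thresholds (single filers use
+# $25,000/$34,000); they are not indexed for inflation.
+def taxable_social_security(benefits, other_income, base=32_000, adjusted_base=44_000):
+    provisional = other_income + 0.5 * benefits
+    if provisional <= base:
+        return 0.0
+    if provisional <= adjusted_base:
+        # First tier: up to 50% of benefits becomes taxable
+        return min(0.5 * (provisional - base), 0.5 * benefits)
+    # Second tier: up to 85% of benefits becomes taxable
+    tier_1 = min(0.5 * (adjusted_base - base), 0.5 * benefits)
+    return min(0.85 * (provisional - adjusted_base) + tier_1, 0.85 * benefits)
+
+# Under the proposal both inclusion rates are set to zero, so taxable
+# Social Security benefits fall to zero for every household.
+```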
+
+### Key Findings
+
+* **Budgetary Impact:** The policy is projected to reduce federal revenues by **$1.45 trillion** over the 10-year budget window (2025-2034). Over the long term, it is projected to increase federal debt by 7 percent by 2054, relative to the current baseline.
+
+* **Macroeconomic Impact:** The analysis finds the policy would have negative long-term effects on the economy.
+    * It reduces incentives for households to save for retirement and to work.
+    * This leads to a smaller capital stock (projected to be 4.2% lower by 2054).
+    * The smaller capital stock results in lower average wages (1.8% lower by 2054) and lower GDP (2.1% lower by 2054).
+
+* **Conventional Distributional Impact:** PWBM's distributional table reports the annual "conventional" effects on household after-tax income.
+    * The largest average *dollar* tax cuts go to households in the top 20 percent of the income distribution (80-100%).
+    * The largest *relative* gains (as a percentage of income) go to households in the fourth quintile (60-80%), who see a 1.6% increase in after-tax income by 2054.
+    * The dollar amounts shown are in **nominal dollars** for each specified year, not adjusted to a single base year.
+
+* **Dynamic (Lifetime) Impact:** When analyzing the policy's effects over a household's entire lifetime, PWBM finds:
+    * The policy primarily benefits high-income households who are nearing or in retirement.
+    * It negatively impacts all households under the age of 30 and all future generations, who would experience a net welfare loss due to the long-term effects of lower wages and higher federal debt.
+
+## PolicyEngine's Analysis of Eliminating Income Taxes on Social Security Benefits
+
+```python
+import sys
+import os
+import pandas as pd
+import numpy as np
+import gc
+
+from policyengine_us import Microsimulation
+from policyengine_core.reforms import Reform
+
+WHARTON_BENCHMARKS = {
+    2026: {
+        'First quintile': {'tax_change': 0, 'pct_change': 0.0},
+        'Second quintile': {'tax_change': -15, 'pct_change': 0.0},
+        'Middle quintile': {'tax_change': -340, 'pct_change': 0.5},
+        'Fourth quintile': {'tax_change': -1135, 'pct_change': 1.1},
+        '80-90%': {'tax_change': -1625, 'pct_change': 1.0},
+        '90-95%': {'tax_change': -1590, 'pct_change': 0.7},
+        '95-99%': {'tax_change': -2020, 'pct_change': 0.5},
+        '99-99.9%': {'tax_change': -2205, 'pct_change': 0.2},
+        'Top 0.1%': {'tax_change': -2450, 'pct_change': 0.0},
+    },
+    2034: {
+        'First quintile': {'tax_change': 0, 'pct_change': 0.0},
+        'Second quintile': {'tax_change': -45, 'pct_change': 0.1},
+        'Middle quintile': {'tax_change': -615, 'pct_change': 0.8},
+        'Fourth quintile': {'tax_change': -1630, 'pct_change': 1.2},
+        '80-90%': {'tax_change': -2160, 'pct_change': 1.1},
+        '90-95%': {'tax_change': -2160, 'pct_change': 0.7},
+        '95-99%': {'tax_change': -2605, 'pct_change': 0.6},
+        '99-99.9%': {'tax_change': -2715, 'pct_change': 0.2},
+        'Top 0.1%': {'tax_change': -2970, 'pct_change': 0.0},
+    },
+    2054: {
+        'First quintile': {'tax_change': -5, 'pct_change': 0.0},
+        'Second quintile': {'tax_change': -275, 'pct_change': 0.3},
+        'Middle quintile': {'tax_change': -1730, 'pct_change': 1.3},
+        'Fourth quintile': {'tax_change': -3560, 'pct_change': 1.6},
+        '80-90%': {'tax_change': -4075, 'pct_change': 1.2},
+        '90-95%': {'tax_change': -4385, 'pct_change': 0.9},
+        '95-99%': {'tax_change': -4565, 'pct_change': 0.6},
+        '99-99.9%': {'tax_change': -4820, 'pct_change': 0.2},
+        'Top 0.1%': {'tax_change': -5080, 'pct_change': 0.0},
+    },
+}
+
+def 
run_analysis(dataset_path, year, income_rank_var = "household_net_income"): + """Run Option 1 analysis for given dataset and year""" + + option1_reform = Reform.from_dict( + { + # Base rate parameters (0-50% bracket) + "gov.irs.social_security.taxability.rate.base.benefit_cap": { + "2026-01-01.2100-12-31": 0 + }, + "gov.irs.social_security.taxability.rate.base.excess": { + "2026-01-01.2100-12-31": 0 + }, + # Additional rate parameters (50-85% bracket) + "gov.irs.social_security.taxability.rate.additional.benefit_cap": { + "2026-01-01.2100-12-31": 0 + }, + "gov.irs.social_security.taxability.rate.additional.bracket": { + "2026-01-01.2100-12-31": 0 + }, + "gov.irs.social_security.taxability.rate.additional.excess": { + "2026-01-01.2100-12-31": 0 + } + }, country_id="us" + ) + reform = Microsimulation(dataset=dataset_path, reform=option1_reform) + + # Get household data + household_net_income_reform = reform.calculate("household_net_income", period=year, map_to="household") + household_agi_reform = reform.calculate("adjusted_gross_income", period=year, map_to="household") + income_tax_reform = reform.calculate("income_tax", period=year, map_to="household") + + del reform + gc.collect() + + print(f"Loading dataset: {dataset_path}") + baseline = Microsimulation(dataset=dataset_path) + household_weight = baseline.calculate("household_weight", period=year) + household_net_income_baseline = baseline.calculate("household_net_income", period=year, map_to="household") + household_agi_baseline = baseline.calculate("adjusted_gross_income", period=year, map_to="household") + income_tax_baseline = baseline.calculate("income_tax", period=year, map_to="household") + + # Calculate changes + tax_change = income_tax_reform - income_tax_baseline + income_change_pct = ( + (household_net_income_reform - household_net_income_baseline) / household_net_income_baseline + ) * 100 + + # Create DataFrame + df = pd.DataFrame({ + 'household_net_income': household_net_income_baseline, + 'weight': household_weight, + 'tax_change': tax_change, + 'income_change_pct': income_change_pct, + 'income_rank_var': baseline.calculate(income_rank_var, year, map_to="household") + }) + + # Calculate percentiles + + print(f"Ranking according to quantiles with: {income_rank_var}") + df['income_percentile'] = df['income_rank_var'].rank(pct=True) * 100 + + # Assign income groups + def assign_income_group(percentile): + if percentile <= 20: + return 'First quintile' + elif percentile <= 40: + return 'Second quintile' + elif percentile <= 60: + return 'Middle quintile' + elif percentile <= 80: + return 'Fourth quintile' + elif percentile <= 90: + return '80-90%' + elif percentile <= 95: + return '90-95%' + elif percentile <= 99: + return '95-99%' + elif percentile <= 99.9: + return '99-99.9%' + else: + return 'Top 0.1%' + + df['income_group'] = df['income_percentile'].apply(assign_income_group) + + # Calculate aggregate revenue + revenue_impact = (income_tax_reform.sum() - income_tax_baseline.sum()) / 1e9 + + # Calculate by group + results = [] + for group in ['First quintile', 'Second quintile', 'Middle quintile', 'Fourth quintile', + '80-90%', '90-95%', '95-99%', '99-99.9%', 'Top 0.1%']: + group_data = df[df['income_group'] == group] + if len(group_data) == 0: + continue + + total_weight = group_data['weight'].sum() + avg_tax_change = (group_data['tax_change'] * group_data['weight']).sum() / total_weight + avg_income_change_pct = (group_data['income_change_pct'] * group_data['weight']).sum() / total_weight + + results.append({ + 'group': 
group, + 'pe_tax_change': round(avg_tax_change), + 'pe_pct_change': round(avg_income_change_pct, 1), + }) + + return pd.DataFrame(results), revenue_impact + +def generate_comparison_table(pe_results, year): + """Generate comparison table with Wharton benchmark""" + + if year not in WHARTON_BENCHMARKS: + print(f"Warning: No Wharton benchmark available for year {year}") + return pe_results + + wharton_data = WHARTON_BENCHMARKS[year] + + comparison = [] + for _, row in pe_results.iterrows(): + group = row['group'] + wharton = wharton_data.get(group, {'tax_change': None, 'pct_change': None}) + + pe_tax = row['pe_tax_change'] + wh_tax = wharton['tax_change'] + + comparison.append({ + 'Income Group': group, + 'PolicyEngine': f"${pe_tax:,}", + 'Wharton': f"${wh_tax:,}" if wh_tax is not None else 'N/A', + 'Difference': f"${(pe_tax - wh_tax):,}" if wh_tax is not None else 'N/A', + 'PE %': f"{row['pe_pct_change']}%", + 'Wharton %': f"{wharton['pct_change']}%" if wharton['pct_change'] is not None else 'N/A', + }) + + return pd.DataFrame(comparison) + +# Example usage: +# dataset_path = 'hf://policyengine/test/2054.h5' +# year = 2054 +# income_rank_variable = "household_net_income" +# pe_results, revenue_impact = run_analysis(dataset_path, year, income_rank_variable) +# comparison_table = generate_comparison_table(pe_results, year) +``` diff --git a/docs/myst.yml b/docs/myst.yml index a39b3cfb..fbbdcbd9 100644 --- a/docs/myst.yml +++ b/docs/myst.yml @@ -24,7 +24,7 @@ project: - file: background.md - file: data.md - file: methodology.md - - file: long_term_projections.ipynb + - file: long_term_projections.md - file: discussion.md - file: conclusion.md - file: appendix.md From 88749b0405dbed9b6b0384ced929fbd501ec6835 Mon Sep 17 00:00:00 2001 From: David Trimmer Date: Tue, 9 Dec 2025 13:18:56 -0500 Subject: [PATCH 05/10] Use PUMA-level geographic adjustments for ACS SPM thresholds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, ACS SPM thresholds used national-level values only (GEOADJ=1.0). This update uses spm-calculator's PUMA-level geographic adjustment lookup to compute thresholds with local housing cost adjustments. Changes: - Add calculate_spm_thresholds_by_puma() function to spm.py - Update ACS dataset to pass state FIPS and PUMA codes to the new function - PUMA identifiers are constructed as 7-digit codes (2-digit state + 5-digit PUMA) Note: Requires CENSUS_API_KEY environment variable for PUMA lookups. 
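Example of the identifier construction (illustrative only, not part of the
committed code): a 2-digit zero-padded state code is concatenated with a
5-digit zero-padded PUMA code, e.g.

    >>> f"{6:02d}{101:05d}"   # state FIPS 6, PUMA 101
    '0600101'
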
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- policyengine_us_data/datasets/acs/acs.py | 13 ++++-- policyengine_us_data/utils/spm.py | 55 ++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 4 deletions(-) diff --git a/policyengine_us_data/datasets/acs/acs.py b/policyengine_us_data/datasets/acs/acs.py index 836c2a1f..d5ae6402 100644 --- a/policyengine_us_data/datasets/acs/acs.py +++ b/policyengine_us_data/datasets/acs/acs.py @@ -101,7 +101,7 @@ def add_spm_variables( time_period: int, ) -> None: from policyengine_us_data.utils.spm import ( - calculate_spm_thresholds_national, + calculate_spm_thresholds_by_puma, map_tenure_acs_to_spm, ) @@ -133,12 +133,17 @@ def add_spm_variables( # Map ACS tenure to SPM tenure codes tenure_codes = map_tenure_acs_to_spm(household_indexed["TEN"].values) - # Calculate SPM thresholds using national-level values - # (ACS doesn't have Census-provided geographic adjustments) - acs["spm_unit_spm_threshold"] = calculate_spm_thresholds_national( + # Get geographic identifiers for PUMA-level adjustments + state_fips = household_indexed["ST"].values.astype(int) + puma_codes = household_indexed["PUMA"].values.astype(int) + + # Calculate SPM thresholds using PUMA-level geographic adjustments + acs["spm_unit_spm_threshold"] = calculate_spm_thresholds_by_puma( num_adults=num_adults, num_children=num_children, tenure_codes=tenure_codes, + state_fips=state_fips, + puma_codes=puma_codes, year=time_period, ) diff --git a/policyengine_us_data/utils/spm.py b/policyengine_us_data/utils/spm.py index 05c04234..a8c552cb 100644 --- a/policyengine_us_data/utils/spm.py +++ b/policyengine_us_data/utils/spm.py @@ -98,6 +98,61 @@ def calculate_spm_thresholds_national( return thresholds +def calculate_spm_thresholds_by_puma( + num_adults: np.ndarray, + num_children: np.ndarray, + tenure_codes: np.ndarray, + state_fips: np.ndarray, + puma_codes: np.ndarray, + year: int, +) -> np.ndarray: + """ + Calculate SPM thresholds using PUMA-level geographic adjustments. + + This function uses the spm-calculator's PUMA-level geographic adjustment + lookup to compute thresholds with local housing cost adjustments. + Requires CENSUS_API_KEY environment variable to be set. + + Args: + num_adults: Array of number of adults (18+) in each SPM unit. + num_children: Array of number of children (<18) in each SPM unit. + tenure_codes: Array of Census tenure/mortgage status codes. + 1 = owner with mortgage, 2 = owner without mortgage, 3 = renter. + state_fips: Array of 2-digit state FIPS codes (integers). + puma_codes: Array of 5-digit PUMA codes (integers). + year: The year for which to calculate thresholds. + + Returns: + Array of SPM threshold values with PUMA-level geographic adjustments. 
+ """ + calc = SPMCalculator(year=year) + + # Build 7-digit PUMA identifiers: 2-digit state + 5-digit PUMA + # Format as strings with proper zero-padding + puma_ids = np.array( + [ + f"{int(state):02d}{int(puma):05d}" + for state, puma in zip(state_fips, puma_codes) + ] + ) + + # Map tenure codes to strings + tenure_strs = np.array( + [TENURE_CODE_MAP.get(int(t), "renter") for t in tenure_codes] + ) + + # Use vectorized calculation from spm-calculator + thresholds = calc.calculate_thresholds( + num_adults=num_adults.astype(int), + num_children=num_children.astype(int), + tenure=tenure_strs, + geography_type="puma", + geography_ids=puma_ids, + ) + + return thresholds + + def map_tenure_acs_to_spm(tenure_type: np.ndarray) -> np.ndarray: """ Map ACS tenure type values to spm-calculator tenure codes. From 23fce035a345fa5cd45c040556afeba210c86e45 Mon Sep 17 00:00:00 2001 From: David Trimmer Date: Wed, 10 Dec 2025 13:52:26 -0500 Subject: [PATCH 06/10] Use national SPM thresholds for ACS (no geographic adjustment) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Switch from calculate_spm_thresholds_by_puma to calculate_spm_thresholds_national to avoid Census API dependency that was causing CI failures - Geographic adjustments will be applied later in the pipeline when households are assigned to specific areas (per discussion with Max) - Remove obsolete long_term_projections.ipynb reference from myst.yml 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- docs/myst.yml | 1 - policyengine_us_data/datasets/acs/acs.py | 14 +++++--------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/docs/myst.yml b/docs/myst.yml index a07556cd..c38666af 100644 --- a/docs/myst.yml +++ b/docs/myst.yml @@ -26,7 +26,6 @@ project: - file: methodology.md - file: long_term_projections.md - file: local_area_calibration_setup.ipynb - - file: long_term_projections.ipynb - file: discussion.md - file: conclusion.md - file: appendix.md diff --git a/policyengine_us_data/datasets/acs/acs.py b/policyengine_us_data/datasets/acs/acs.py index d5ae6402..df227bd9 100644 --- a/policyengine_us_data/datasets/acs/acs.py +++ b/policyengine_us_data/datasets/acs/acs.py @@ -101,7 +101,7 @@ def add_spm_variables( time_period: int, ) -> None: from policyengine_us_data.utils.spm import ( - calculate_spm_thresholds_by_puma, + calculate_spm_thresholds_national, map_tenure_acs_to_spm, ) @@ -133,17 +133,13 @@ def add_spm_variables( # Map ACS tenure to SPM tenure codes tenure_codes = map_tenure_acs_to_spm(household_indexed["TEN"].values) - # Get geographic identifiers for PUMA-level adjustments - state_fips = household_indexed["ST"].values.astype(int) - puma_codes = household_indexed["PUMA"].values.astype(int) - - # Calculate SPM thresholds using PUMA-level geographic adjustments - acs["spm_unit_spm_threshold"] = calculate_spm_thresholds_by_puma( + # Calculate SPM thresholds using national-level values (no geographic + # adjustment). Geographic adjustments will be applied later in the + # pipeline when households are assigned to specific areas. 
+ acs["spm_unit_spm_threshold"] = calculate_spm_thresholds_national( num_adults=num_adults, num_children=num_children, tenure_codes=tenure_codes, - state_fips=state_fips, - puma_codes=puma_codes, year=time_period, ) From ea3391dd7b9d6a27128f93eea9d9b777fc7ebf01 Mon Sep 17 00:00:00 2001 From: David Trimmer Date: Fri, 12 Dec 2025 12:02:57 -0500 Subject: [PATCH 07/10] Address PR review feedback: revert ACS changes and simplify CPS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Revert all ACS changes - no longer touching ACS since SPM_POVTHRESHOLD is already available and we don't need to calculate thresholds there - Remove time_period argument from add_spm_variables, use self.time_period instead to match pattern of other functions like add_rent 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- policyengine_us_data/datasets/acs/acs.py | 52 ++---------------------- policyengine_us_data/datasets/cps/cps.py | 10 ++--- 2 files changed, 6 insertions(+), 56 deletions(-) diff --git a/policyengine_us_data/datasets/acs/acs.py b/policyengine_us_data/datasets/acs/acs.py index df227bd9..0ecd3ee7 100644 --- a/policyengine_us_data/datasets/acs/acs.py +++ b/policyengine_us_data/datasets/acs/acs.py @@ -25,7 +25,6 @@ def generate(self) -> None: self.add_id_variables(acs, person, household) self.add_person_variables(acs, person, household) self.add_household_variables(acs, household) - self.add_spm_variables(acs, person, household, self.time_period) acs.close() raw_data.close() @@ -94,54 +93,9 @@ def add_person_variables( ) @staticmethod - def add_spm_variables( - acs: h5py.File, - person: DataFrame, - household: DataFrame, - time_period: int, - ) -> None: - from policyengine_us_data.utils.spm import ( - calculate_spm_thresholds_national, - map_tenure_acs_to_spm, - ) - - # In ACS, SPM unit = household - # Calculate number of adults (18+) and children (<18) per household - person_with_hh = person.copy() - person_with_hh["is_adult"] = person_with_hh["AGEP"] >= 18 - person_with_hh["is_child"] = person_with_hh["AGEP"] < 18 - - hh_counts = ( - person_with_hh.groupby("household_id") - .agg({"is_adult": "sum", "is_child": "sum"}) - .rename( - columns={"is_adult": "num_adults", "is_child": "num_children"} - ) - ) - - # Ensure household is indexed properly - household_indexed = household.set_index("household_id") - - # Get counts aligned with household order - num_adults = hh_counts.loc[ - household_indexed.index, "num_adults" - ].values - num_children = hh_counts.loc[ - household_indexed.index, "num_children" - ].values - - # Map ACS tenure to SPM tenure codes - tenure_codes = map_tenure_acs_to_spm(household_indexed["TEN"].values) - - # Calculate SPM thresholds using national-level values (no geographic - # adjustment). Geographic adjustments will be applied later in the - # pipeline when households are assigned to specific areas. 
- acs["spm_unit_spm_threshold"] = calculate_spm_thresholds_national( - num_adults=num_adults, - num_children=num_children, - tenure_codes=tenure_codes, - year=time_period, - ) + def add_spm_variables(acs: h5py.File, spm_unit: DataFrame) -> None: + acs["spm_unit_net_income_reported"] = spm_unit.SPM_RESOURCES + acs["spm_unit_spm_threshold"] = spm_unit.SPM_POVTHRESHOLD @staticmethod def add_household_variables(acs: h5py.File, household: DataFrame) -> None: diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 8bbf5746..9b09a38f 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -78,7 +78,7 @@ def generate(self): undocumented_students_target=0.21 * 1.9e6, ) logging.info("Adding family variables") - add_spm_variables(cps, spm_unit, self.time_period) + add_spm_variables(self, cps, spm_unit) logging.info("Adding household variables") add_household_variables(cps, household) logging.info("Adding rent") @@ -602,11 +602,7 @@ def add_personal_income_variables( cps[f"{var}_would_be_qualified"] = rng.random(len(person)) < prob -def add_spm_variables( - cps: h5py.File, - spm_unit: DataFrame, - time_period: int, -) -> None: +def add_spm_variables(self, cps: h5py.File, spm_unit: DataFrame) -> None: from policyengine_us_data.utils.spm import ( calculate_spm_thresholds_with_geoadj, ) @@ -639,7 +635,7 @@ def add_spm_variables( num_children=spm_unit["SPM_NUMKIDS"].values, tenure_codes=spm_unit["SPM_TENMORTSTATUS"].values, geoadj=spm_unit["SPM_GEOADJ"].values, - year=time_period, + year=self.time_period, ) cps["reduced_price_school_meals_reported"] = ( From 0addc175e0838d35ba40090104abea54a951725d Mon Sep 17 00:00:00 2001 From: David Trimmer Date: Fri, 12 Dec 2025 12:08:38 -0500 Subject: [PATCH 08/10] Remove redundant comments and unused functions from spm.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove redundant comment above TENURE_CODE_MAP (self-explanatory) - Remove calculate_spm_thresholds_national (unused after ACS revert) - Remove map_tenure_acs_to_spm (unused after ACS revert) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- policyengine_us_data/utils/spm.py | 69 ------------------------------- 1 file changed, 69 deletions(-) diff --git a/policyengine_us_data/utils/spm.py b/policyengine_us_data/utils/spm.py index a8c552cb..6211489c 100644 --- a/policyengine_us_data/utils/spm.py +++ b/policyengine_us_data/utils/spm.py @@ -4,11 +4,6 @@ from spm_calculator import SPMCalculator, spm_equivalence_scale -# Census CPS SPM_TENMORTSTATUS codes to spm-calculator tenure mapping -# Based on IPUMS SPMMORT documentation: -# 1 = Owner with mortgage -# 2 = Owner without mortgage -# 3 = Renter TENURE_CODE_MAP = { 1: "owner_with_mortgage", 2: "owner_without_mortgage", @@ -58,46 +53,6 @@ def calculate_spm_thresholds_with_geoadj( return thresholds -def calculate_spm_thresholds_national( - num_adults: np.ndarray, - num_children: np.ndarray, - tenure_codes: np.ndarray, - year: int, -) -> np.ndarray: - """ - Calculate SPM thresholds using national-level thresholds (no geoadj). - - This is used for datasets like ACS that don't have pre-computed - geographic adjustment factors. - - Args: - num_adults: Array of number of adults (18+) in each SPM unit. - num_children: Array of number of children (<18) in each SPM unit. - tenure_codes: Array of Census tenure/mortgage status codes. 
- 1 = owner with mortgage, 2 = owner without mortgage, 3 = renter. - year: The year for which to calculate thresholds. - - Returns: - Array of SPM threshold values using national averages. - """ - calc = SPMCalculator(year=year) - base_thresholds = calc.get_base_thresholds() - - n = len(num_adults) - thresholds = np.zeros(n) - - for i in range(n): - tenure_str = TENURE_CODE_MAP.get(int(tenure_codes[i]), "renter") - base = base_thresholds[tenure_str] - equiv_scale = spm_equivalence_scale( - int(num_adults[i]), int(num_children[i]) - ) - # No geographic adjustment for national-level thresholds - thresholds[i] = base * equiv_scale - - return thresholds - - def calculate_spm_thresholds_by_puma( num_adults: np.ndarray, num_children: np.ndarray, @@ -151,27 +106,3 @@ def calculate_spm_thresholds_by_puma( ) return thresholds - - -def map_tenure_acs_to_spm(tenure_type: np.ndarray) -> np.ndarray: - """ - Map ACS tenure type values to spm-calculator tenure codes. - - Args: - tenure_type: Array of ACS TEN values. - 1 = Owned with mortgage/loan - 2 = Owned free and clear - 3 = Rented - - Returns: - Array of tenure code integers matching Census SPM format. - """ - # ACS TEN codes map directly to Census SPM codes: - # ACS 1 (owned with mortgage) -> Census 1 (owner_with_mortgage) - # ACS 2 (owned outright) -> Census 2 (owner_without_mortgage) - # ACS 3 (rented) -> Census 3 (renter) - return np.where( - np.isin(tenure_type, [1, 2, 3]), - tenure_type, - 3, # Default to renter for unknown values - ) From ecde9650576aecc2235c2d12fe6780920ccf4588 Mon Sep 17 00:00:00 2001 From: David Trimmer Date: Fri, 12 Dec 2025 12:09:54 -0500 Subject: [PATCH 09/10] Remove unused calculate_spm_thresholds_by_puma function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- policyengine_us_data/utils/spm.py | 55 ------------------------------- 1 file changed, 55 deletions(-) diff --git a/policyengine_us_data/utils/spm.py b/policyengine_us_data/utils/spm.py index 6211489c..070db533 100644 --- a/policyengine_us_data/utils/spm.py +++ b/policyengine_us_data/utils/spm.py @@ -51,58 +51,3 @@ def calculate_spm_thresholds_with_geoadj( thresholds[i] = base * equiv_scale * geoadj[i] return thresholds - - -def calculate_spm_thresholds_by_puma( - num_adults: np.ndarray, - num_children: np.ndarray, - tenure_codes: np.ndarray, - state_fips: np.ndarray, - puma_codes: np.ndarray, - year: int, -) -> np.ndarray: - """ - Calculate SPM thresholds using PUMA-level geographic adjustments. - - This function uses the spm-calculator's PUMA-level geographic adjustment - lookup to compute thresholds with local housing cost adjustments. - Requires CENSUS_API_KEY environment variable to be set. - - Args: - num_adults: Array of number of adults (18+) in each SPM unit. - num_children: Array of number of children (<18) in each SPM unit. - tenure_codes: Array of Census tenure/mortgage status codes. - 1 = owner with mortgage, 2 = owner without mortgage, 3 = renter. - state_fips: Array of 2-digit state FIPS codes (integers). - puma_codes: Array of 5-digit PUMA codes (integers). - year: The year for which to calculate thresholds. - - Returns: - Array of SPM threshold values with PUMA-level geographic adjustments. 
- """ - calc = SPMCalculator(year=year) - - # Build 7-digit PUMA identifiers: 2-digit state + 5-digit PUMA - # Format as strings with proper zero-padding - puma_ids = np.array( - [ - f"{int(state):02d}{int(puma):05d}" - for state, puma in zip(state_fips, puma_codes) - ] - ) - - # Map tenure codes to strings - tenure_strs = np.array( - [TENURE_CODE_MAP.get(int(t), "renter") for t in tenure_codes] - ) - - # Use vectorized calculation from spm-calculator - thresholds = calc.calculate_thresholds( - num_adults=num_adults.astype(int), - num_children=num_children.astype(int), - tenure=tenure_strs, - geography_type="puma", - geography_ids=puma_ids, - ) - - return thresholds From 3c000981a112883c5386040fc4ef8e67eaf9eb13 Mon Sep 17 00:00:00 2001 From: David Trimmer Date: Fri, 19 Dec 2025 12:17:34 -0500 Subject: [PATCH 10/10] switch to PyPi --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 06a7af91..14acb68e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,7 +42,7 @@ dependencies = [ "sqlalchemy>=2.0.41", "sqlmodel>=0.0.24", "xlrd>=2.0.2", - "spm-calculator @ git+https://github.com/PolicyEngine/spm-calculator.git", + "spm-calculator>=0.1.0", ] [project.optional-dependencies]