diff --git a/.github/workflows/code_changes.yaml b/.github/workflows/code_changes.yaml
index 6b474227..b752e953 100644
--- a/.github/workflows/code_changes.yaml
+++ b/.github/workflows/code_changes.yaml
@@ -2,6 +2,7 @@ name: Code changes
 
 on:
+  workflow_call:
   push:
     branches:
       - main
 
@@ -27,7 +28,7 @@ jobs:
       contents: "read"
       # Required to auth against gcp
       id-token: "write"
-    runs-on: larger-runner
+    runs-on: ubuntu-latest
     steps:
       - name: Checkout repo
         uses: actions/checkout@v2
diff --git a/.github/workflows/manual_tests.yaml b/.github/workflows/manual_tests.yaml
deleted file mode 100644
index a2daca18..00000000
--- a/.github/workflows/manual_tests.yaml
+++ /dev/null
@@ -1,17 +0,0 @@
-name: Manual tests
-
-on:
-  workflow_dispatch:
-    inputs:
-      test_lite:
-        description: 'Run in lite mode'
-        required: true
-        default: true
-        type: boolean
-
-jobs:
-  test:
-    uses: ./.github/workflows/pr_changelog.yaml
-    with:
-      TEST_LITE: ${{ github.event.inputs.test_lite }}
-    secrets: inherit
diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml
index 213d192f..4e30d089 100644
--- a/.github/workflows/pr_code_changes.yaml
+++ b/.github/workflows/pr_code_changes.yaml
@@ -49,7 +49,7 @@ jobs:
         run: python -c "from policyengine_core.data import Dataset; print('Core import OK')"
 
   Test:
-    runs-on: larger-runner
+    runs-on: ubuntu-latest
     needs: Lint
     env:
       HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
diff --git a/.gitignore b/.gitignore
index a66fdef5..b65290c2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@
 **/*.h5
 **/*.csv
 !healthcare_spending.csv
+!medicaid_enrollment_2024.csv
 !eitc.csv
 !spm_threshold_agi.csv
 **/_build
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e8b2a506..e355d4dd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,28 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.37.1] - 2025-07-14 15:33:11
+
+### Changed
+
+- Removed bad targets that were causing problems with estimation.
+- Lite mode now builds CPS_2023 in addition to CPS_2024.
+- Added an epochs argument to reweight and set it to 150 for optimization.
+- Updated minimum versions of the policyengine-us and pandas dependencies.
+- Removed non-working manual workflow code.
+
+## [1.37.0] - 2025-07-09 14:58:33
+
+### Added
+
+- Medicaid state level calibration targets.
+
+## [1.36.2] - 2025-07-08 21:53:02
+
+### Fixed
+
+- Use SURVIVING_SPOUSE and is_surviving_spouse instead of WIDOW and is_widowed.
+
 ## [1.36.1] - 2025-07-03 09:21:06
 
 ### Changed
@@ -508,6 +530,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
+[1.37.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.37.0...1.37.1
+[1.37.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.36.2...1.37.0
+[1.36.2]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.36.1...1.36.2
 [1.36.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.36.0...1.36.1
 [1.36.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.35.2...1.36.0
 [1.35.2]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.35.1...1.35.2
diff --git a/changelog.yaml b/changelog.yaml
index 2f07e429..af7cdf32 100644
--- a/changelog.yaml
+++ b/changelog.yaml
@@ -423,3 +423,22 @@
     changed:
       - PR tests to be more similar to production builds.
   date: 2025-07-03 09:21:06
+- bump: patch
+  changes:
+    fixed:
+      - Use SURVIVING_SPOUSE and is_surviving_spouse instead of WIDOW and is_widowed.
+  date: 2025-07-08 21:53:02
+- bump: minor
+  changes:
+    added:
+      - Medicaid state level calibration targets.
+  date: 2025-07-09 14:58:33
+- bump: patch
+  changes:
+    changed:
+      - Removed bad targets that were causing problems with estimation.
+      - Lite mode now builds CPS_2023 in addition to CPS_2024.
+      - Added an epochs argument to reweight and set it to 150 for optimization.
+      - Updated minimum versions of the policyengine-us and pandas dependencies.
+      - Removed non-working manual workflow code.
+  date: 2025-07-14 15:33:11
diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py
index f6fd0e75..d9957cbb 100644
--- a/policyengine_us_data/datasets/cps/cps.py
+++ b/policyengine_us_data/datasets/cps/cps.py
@@ -100,7 +100,6 @@ def downsample(self, frac: float):
         original_dtypes = {
             key: original_data[key].dtype for key in original_data
         }
-
         sim = Microsimulation(dataset=self)
         sim.subsample(frac=frac)
@@ -356,7 +355,7 @@ def children_per_parent(col: str) -> pd.DataFrame:
     cps["cps_race"] = person.PRDTRACE
     cps["is_hispanic"] = person.PRDTHSP != 0
-    cps["is_widowed"] = person.A_MARITL == 4
+    cps["is_surviving_spouse"] = person.A_MARITL == 4
     cps["is_separated"] = person.A_MARITL == 6
     # High school or college/university enrollment status.
     cps["is_full_time_college_student"] = person.A_HSCOL == 2
@@ -2006,6 +2005,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS):
 if __name__ == "__main__":
     if test_lite:
+        CPS_2023().generate()
         CPS_2024().generate()
     else:
         CPS_2021().generate()
diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py
index b8af12ce..7a471d40 100644
--- a/policyengine_us_data/datasets/cps/enhanced_cps.py
+++ b/policyengine_us_data/datasets/cps/enhanced_cps.py
@@ -28,6 +28,7 @@ def reweight(
     targets_array,
     dropout_rate=0.05,
     log_path="calibration_log.csv",
+    epochs=150,
 ):
     target_names = np.array(loss_matrix.columns)
     is_national = loss_matrix.columns.str.startswith("nation/")
@@ -45,7 +46,7 @@ def reweight(
         np.log(original_weights), requires_grad=True, dtype=torch.float32
     )
 
-    # TODO: replace this with a call to the python reweight.py package.
+    # TODO: replace this functionality with the microcalibrate package.
     def loss(weights):
         # Check for Nans in either the weights or the loss matrix
         if torch.isnan(weights).any():
@@ -78,7 +79,7 @@ def dropout_weights(weights, p):
 
     start_loss = None
 
-    iterator = trange(500)
+    iterator = trange(epochs)
     performance = pd.DataFrame()
     for i in iterator:
         optimizer.zero_grad()
@@ -178,18 +179,71 @@ def generate(self):
         original_weights = original_weights.values + np.random.normal(
             1, 0.1, len(original_weights)
         )
+
+        bad_targets = [
+            "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household",
+            "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household",
+            "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse",
+            "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse",
+            "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household",
+            "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household",
+            "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse",
+            "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse",
+            "state/RI/adjusted_gross_income/amount/-inf_1",
+            "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household",
+            "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household",
+            "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse",
+            "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse",
+            "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household",
+            "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household",
+            "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse",
+            "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse",
+            "state/RI/adjusted_gross_income/amount/-inf_1",
+            "nation/irs/exempt interest/count/AGI in -inf-inf/taxable/All",
+        ]
+
+        # Run the optimization procedure to get (close to) minimum loss weights
         for year in range(self.start_year, self.end_year + 1):
             loss_matrix, targets_array = build_loss_matrix(
                 self.input_dataset, year
             )
+            zero_mask = np.isclose(targets_array, 0.0, atol=0.1)
+            bad_mask = loss_matrix.columns.isin(bad_targets)
+            keep_mask_bool = ~(zero_mask | bad_mask)
+            keep_idx = np.where(keep_mask_bool)[0]
+            loss_matrix_clean = loss_matrix.iloc[:, keep_idx]
+            targets_array_clean = targets_array[keep_idx]
+            assert loss_matrix_clean.shape[1] == targets_array_clean.size
+
             optimised_weights = reweight(
                 original_weights,
-                loss_matrix,
-                targets_array,
+                loss_matrix_clean,
+                targets_array_clean,
                 log_path="calibration_log.csv",
+                epochs=150,
             )
             data["household_weight"][year] = optimised_weights
+            print("\n\n---reweighting quick diagnostics----\n")
+            estimate = optimised_weights @ loss_matrix_clean
+            rel_error = (
+                ((estimate - targets_array_clean) + 1)
+                / (targets_array_clean + 1)
+            ) ** 2
+            print(
+                f"rel_error: min: {np.min(rel_error):.2f}, "
+                f"max: {np.max(rel_error):.2f} "
+                f"mean: {np.mean(rel_error):.2f}, "
+                f"median: {np.median(rel_error):.2f}"
+            )
+            print("Relative error over 100% for:")
+            for i in np.where(rel_error > 1)[0]:
+                print(f"target_name: {loss_matrix_clean.columns[i]}")
+                print(f"target_value: {targets_array_clean[i]}")
+                print(f"estimate_value: {estimate[i]}")
+                print(f"has rel_error: {rel_error[i]:.2f}\n")
+            print("---End of reweighting quick diagnostics------")
+
         self.save_dataset(data)
diff --git a/policyengine_us_data/storage/README.md b/policyengine_us_data/storage/README.md
new file mode 100644
index 00000000..55f98ed9
--- /dev/null
+++ b/policyengine_us_data/storage/README.md
@@ -0,0 +1,11 @@
+# storage/ datasets
+
+- **aca_spending_and_enrollment_2024.csv**
+  • Source: CMS Marketplace Public Use File, 2024 open-enrollment
+  • Date: 2024
+  • Location: https://www.cms.gov/files/document/health-insurance-exchanges-2024-open-enrollment-report-final.pdf
+
+- **medicaid_enrollment_2024.csv**
+  • Source: MACPAC Enrollment Tables, FFY 2024
+  • Date: 2024
+  • Location: https://www.medicaid.gov/resources-for-states/downloads/eligib-oper-and-enrol-snap-december2024.pdf#page=26
diff --git a/policyengine_us_data/storage/medicaid_enrollment_2024.csv b/policyengine_us_data/storage/medicaid_enrollment_2024.csv
new file mode 100644
index 00000000..108670c5
--- /dev/null
+++ b/policyengine_us_data/storage/medicaid_enrollment_2024.csv
@@ -0,0 +1,52 @@
+state,enrollment
+AK,231577
+AL,766009
+AR,733561
+AZ,1778734
+CA,12172695
+CO,1058326
+CT,904321
+DC,240020
+DE,236840
+FL,3568648
+GA,1699279
+HI,376318
+IA,586748
+ID,296968
+IL,2918179
+IN,1623361
+KS,335902
+KY,1244822
+LA,1377806
+MA,1453344
+MD,1280697
+ME,322306
+MI,2194067
+MN,1146667
+MO,1118780
+MS,514730
+MT,193278
+NC,2469712
+ND,100543
+NE,302971
+NH,166813
+NJ,1506239
+NM,686825
+NV,713936
+NY,5946806
+OH,2596879
+OK,894911
+OR,1123313
+PA,2783389
+RI,273400
+SC,932515
+SD,126952
+TN,1268904
+TX,3821806
+UT,300742
+VA,1596777
+VT,151833
+WA,1776116
+WI,1108320
+WV,467632
+WY,57320
diff --git a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py
index cdb47667..abf67301 100644
--- a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py
+++ b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py
@@ -214,3 +214,43 @@ def test_aca_calibration():
     assert (
         not failed
     ), f"One or more states exceeded tolerance of {TOLERANCE:.0%}."
+
+
+def test_medicaid_calibration():
+
+    import pandas as pd
+    from pathlib import Path
+    from policyengine_us import Microsimulation
+    from policyengine_us_data.datasets.cps import EnhancedCPS_2024
+
+    TARGETS_PATH = Path(
+        "policyengine_us_data/storage/medicaid_enrollment_2024.csv"
+    )
+    targets = pd.read_csv(TARGETS_PATH)
+
+    sim = Microsimulation(dataset=EnhancedCPS_2024)
+    state_code_hh = sim.calculate("state_code", map_to="household").values
+    medicaid_enrolled = sim.calculate(
+        "medicaid_enrolled", map_to="household", period=2025
+    )
+
+    TOLERANCE = 0.45
+    failed = False
+    for _, row in targets.iterrows():
+        state = row["state"]
+        target_enrollment = row["enrollment"]
+        simulated = medicaid_enrolled[state_code_hh == state].sum()
+
+        pct_error = abs(simulated - target_enrollment) / target_enrollment
+        print(
+            f"{state}: simulated {simulated/1e6:.2f}m "
+            f"target {target_enrollment/1e6:.2f}m "
+            f"error {pct_error:.2%}"
+        )
+
+        if pct_error > TOLERANCE:
+            failed = True
+
+    assert (
+        not failed
+    ), f"One or more states exceeded tolerance of {TOLERANCE:.0%}."
diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py
index 9e3cdc60..21abce0f 100644
--- a/policyengine_us_data/utils/loss.py
+++ b/policyengine_us_data/utils/loss.py
@@ -519,6 +519,44 @@ def build_loss_matrix(dataset: type, time_period):
         # Convert to thousands for the target
         targets_array.append(row["enrollment"])
 
+    # Medicaid enrollment by state
+
+    enrollment_by_state = pd.read_csv(
+        STORAGE_FOLDER / "medicaid_enrollment_2024.csv"
+    )
+
+    # One-time pulls so we don't re-compute inside the loop
+    state_person = sim.calculate("state_code", map_to="person").values
+
+    # Flag people who are both enrolled in and eligible for Medicaid
+    has_medicaid = sim.calculate(
+        "medicaid_enrolled", map_to="person", period=2025
+    )
+    is_medicaid_eligible = sim.calculate(
+        "is_medicaid_eligible", map_to="person", period=2025
+    ).values
+    is_enrolled = has_medicaid & is_medicaid_eligible
+
+    for _, row in enrollment_by_state.iterrows():
+        # People who both live in the state and are enrolled in Medicaid
+        in_state = state_person == row["state"]
+        in_state_enrolled = in_state & is_enrolled
+
+        label = f"irs/medicaid_enrollment/{row['state'].lower()}"
+        loss_matrix[label] = sim.map_result(
+            in_state_enrolled, "person", "household"
+        )
+        if any(loss_matrix[label].isna()):
+            raise ValueError(f"Missing values for {label}")
+
+        # The target is the raw enrollment count
+        targets_array.append(row["enrollment"])
+
+        print(
+            f"Targeting Medicaid enrollment for {row['state']} "
+            f"with target {row['enrollment']:,.0f}"
+        )
+
     # State 10-year age targets
 
     age_targets = pd.read_csv(STORAGE_FOLDER / "age_state.csv")
diff --git a/policyengine_us_data/utils/soi.py b/policyengine_us_data/utils/soi.py
index 9d53a142..a34babf6 100644
--- a/policyengine_us_data/utils/soi.py
+++ b/policyengine_us_data/utils/soi.py
@@ -218,7 +218,9 @@ def compare_soi_replication_to_soi(df, soi):
         elif fs == "Head of Household":
             subset = subset[subset.filing_status == "HEAD_OF_HOUSEHOLD"]
         elif fs == "Married Filing Jointly/Surviving Spouse":
-            subset = subset[subset.filing_status.isin(["JOINT", "WIDOW"])]
+            subset = subset[
+                subset.filing_status.isin(["JOINT", "SURVIVING_SPOUSE"])
+            ]
         elif fs == "Married Filing Separately":
             subset = subset[subset.filing_status == "SEPARATE"]
diff --git a/pyproject.toml b/pyproject.toml
index 2f8ca522..5a75693f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "policyengine_us_data"
-version = "1.36.1"
+version = "1.37.1"
 description = "A package to create representative microdata for the US."
 readme = "README.md"
 authors = [
@@ -17,8 +17,9 @@ authors = [
 license = {file = "LICENSE"}
 requires-python = ">=3.11, <3.13.0"
 dependencies = [
-    "policyengine-us>=1.197.0",
-    "policyengine-core>=3.14.1",
+    "policyengine-us>=1.340.1",
+    "policyengine-core>=3.17.1",
+    "pandas>=2.3.0",
     "requests",
     "tqdm",
     "microdf_python>=0.4.3",