Closed
Changes from all commits
Commits
38 commits
35136b8
new branch attempt
daphnehanse11 Jul 3, 2025
e2729a8
recreation
daphnehanse11 Jul 3, 2025
15a6c0a
enrollment
daphnehanse11 Jul 3, 2025
3b15c49
medicaid
daphnehanse11 Jul 3, 2025
ee6c850
readme
daphnehanse11 Jul 7, 2025
98901a0
Use "SURVIVING_SPOUSE" instead of "WIDOW" in `soi.py`
PavelMakarchuk Jul 8, 2025
b51f9e3
updates
daphnehanse11 Jul 8, 2025
d11c9e3
Merge pull request #350 from PolicyEngine/PavelMakarchuk/issue347
MaxGhenis Jul 8, 2025
0584a3c
Update package version
MaxGhenis Jul 8, 2025
ac5bdf2
Merge pull request #343 from PolicyEngine/medicaid-state
MaxGhenis Jul 9, 2025
31fbfe3
Update package version
MaxGhenis Jul 9, 2025
c621c5e
added CPS_2023 to lite mode generation
baogorek Jul 11, 2025
30846a9
Fixed manual test
baogorek Jul 11, 2025
552de17
try again with locked version
baogorek Jul 11, 2025
ab71dd4
trying things
baogorek Jul 11, 2025
57fbdda
lint
baogorek Jul 11, 2025
efd267f
trying 3.11.12
baogorek Jul 11, 2025
eb390dd
now actually specifying py version
baogorek Jul 11, 2025
9d69d6d
pandas v
baogorek Jul 11, 2025
54a7f9d
small runner
baogorek Jul 11, 2025
6122adf
trying everything
baogorek Jul 11, 2025
bcb6b1d
relaxing python version in pyproject.toml
baogorek Jul 11, 2025
eed9993
putting things back in order.
baogorek Jul 11, 2025
65473ce
Use normal runner in PR tests
nikhilwoodruff Jul 12, 2025
6028f99
removed large runner everywhere
baogorek Jul 12, 2025
ff4e9a6
added the 3.11.12 pin
baogorek Jul 12, 2025
43ad439
cps.py
baogorek Jul 14, 2025
879770e
adding diagnostics
baogorek Jul 14, 2025
10b725c
lint
baogorek Jul 14, 2025
f0973be
taking out bad targets
baogorek Jul 14, 2025
d8667a1
fixing workflow arg passthrough
baogorek Jul 14, 2025
8f14e52
deps and defaults
baogorek Jul 14, 2025
29e1621
wrong pipeline for manual test
baogorek Jul 14, 2025
fcd5835
trying again to get the manual test to work
baogorek Jul 14, 2025
82272b6
reverting to older workflow code
baogorek Jul 14, 2025
ab8fa4f
cleaning up enhanced_cps.py
baogorek Jul 14, 2025
d6e4862
Merge pull request #359 from PolicyEngine/bogorek-lite
MaxGhenis Jul 14, 2025
2508741
Update package version
MaxGhenis Jul 14, 2025
3 changes: 2 additions & 1 deletion .github/workflows/code_changes.yaml
@@ -2,6 +2,7 @@

name: Code changes
on:
workflow_call:
push:
branches:
- main
@@ -27,7 +28,7 @@ jobs:
contents: "read"
# Required to auth against gcp
id-token: "write"
runs-on: larger-runner
runs-on: ubuntu-latest
steps:
- name: Checkout repo
uses: actions/checkout@v2
17 changes: 0 additions & 17 deletions .github/workflows/manual_tests.yaml

This file was deleted.

2 changes: 1 addition & 1 deletion .github/workflows/pr_code_changes.yaml
@@ -49,7 +49,7 @@ jobs:
run: python -c "from policyengine_core.data import Dataset; print('Core import OK')"

Test:
runs-on: larger-runner
runs-on: ubuntu-latest
needs: Lint
env:
HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
1 change: 1 addition & 0 deletions .gitignore
@@ -4,6 +4,7 @@
**/*.h5
**/*.csv
!healthcare_spending.csv
!medicaid_enrollment_2024.csv
!eitc.csv
!spm_threshold_agi.csv
**/_build
25 changes: 25 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,28 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [1.37.1] - 2025-07-14 15:33:11

### Changed

- Removed bad targets that were causing problems with estimation
- Lite mode now builds CPS_2023 in addition to CPS_2024
- Added an `epochs` argument to `reweight`, set to 150 for optimization
- Updated minimum versions of the policyengine-us and pandas dependencies
- Removed non-working manual workflow code

## [1.37.0] - 2025-07-09 14:58:33

### Added

- Medicaid state level calibration targets.

## [1.36.2] - 2025-07-08 21:53:02

### Fixed

- Use SURVIVING_SPOUSE and is_surviving_spouse instead of WIDOW and is_widowed.

## [1.36.1] - 2025-07-03 09:21:06

### Changed
@@ -508,6 +530,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0



[1.37.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.37.0...1.37.1
[1.37.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.36.2...1.37.0
[1.36.2]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.36.1...1.36.2
[1.36.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.36.0...1.36.1
[1.36.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.35.2...1.36.0
[1.35.2]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.35.1...1.35.2
19 changes: 19 additions & 0 deletions changelog.yaml
@@ -423,3 +423,22 @@
changed:
- PR tests to be more similar to production builds.
date: 2025-07-03 09:21:06
- bump: patch
changes:
fixed:
- Use SURVIVING_SPOUSE and is_surviving_spouse instead of WIDOW and is_widowed.
date: 2025-07-08 21:53:02
- bump: minor
changes:
added:
- Medicaid state level calibration targets.
date: 2025-07-09 14:58:33
- bump: patch
changes:
changed:
- Removed bad targets that were causing problems with estimation
- Lite mode now builds CPS_2023 in addition to CPS_2024
- Added an epochs argument to reweight, set to 150 for optimization
- Updated minimum versions of the policyengine-us and pandas dependencies
- Removed non-working manual workflow code
date: 2025-07-14 15:33:11
4 changes: 2 additions & 2 deletions policyengine_us_data/datasets/cps/cps.py
@@ -100,7 +100,6 @@ def downsample(self, frac: float):
original_dtypes = {
key: original_data[key].dtype for key in original_data
}

sim = Microsimulation(dataset=self)
sim.subsample(frac=frac)

@@ -356,7 +355,7 @@ def children_per_parent(col: str) -> pd.DataFrame:
cps["cps_race"] = person.PRDTRACE
cps["is_hispanic"] = person.PRDTHSP != 0

cps["is_widowed"] = person.A_MARITL == 4
cps["is_surviving_spouse"] = person.A_MARITL == 4
cps["is_separated"] = person.A_MARITL == 6
# High school or college/university enrollment status.
cps["is_full_time_college_student"] = person.A_HSCOL == 2
@@ -2006,6 +2005,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS):

if __name__ == "__main__":
if test_lite:
CPS_2023().generate()
CPS_2024().generate()
else:
CPS_2021().generate()
62 changes: 58 additions & 4 deletions policyengine_us_data/datasets/cps/enhanced_cps.py
@@ -28,6 +28,7 @@ def reweight(
targets_array,
dropout_rate=0.05,
log_path="calibration_log.csv",
epochs=150,
):
target_names = np.array(loss_matrix.columns)
is_national = loss_matrix.columns.str.startswith("nation/")
@@ -45,7 +46,7 @@
np.log(original_weights), requires_grad=True, dtype=torch.float32
)

# TODO: replace this with a call to the python reweight.py package.
# TODO: replace this functionality with the microcalibrate package.
def loss(weights):
# Check for Nans in either the weights or the loss matrix
if torch.isnan(weights).any():
@@ -78,7 +79,7 @@ def dropout_weights(weights, p):

start_loss = None

iterator = trange(500)
iterator = trange(epochs)
performance = pd.DataFrame()
for i in iterator:
optimizer.zero_grad()
@@ -178,18 +179,71 @@ def generate(self):
original_weights = original_weights.values + np.random.normal(
1, 0.1, len(original_weights)
)

bad_targets = [
"nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household",
"nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household",
"nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse",
"nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse",
"nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household",
"nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household",
"nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse",
"nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse",
"state/RI/adjusted_gross_income/amount/-inf_1",
"nation/irs/exempt interest/count/AGI in -inf-inf/taxable/All",
]

# Run the optimization procedure to get (close to) minimum loss weights
for year in range(self.start_year, self.end_year + 1):
loss_matrix, targets_array = build_loss_matrix(
self.input_dataset, year
)
zero_mask = np.isclose(targets_array, 0.0, atol=0.1)
bad_mask = loss_matrix.columns.isin(bad_targets)
keep_mask_bool = ~(zero_mask | bad_mask)
keep_idx = np.where(keep_mask_bool)[0]
loss_matrix_clean = loss_matrix.iloc[:, keep_idx]
targets_array_clean = targets_array[keep_idx]
assert loss_matrix_clean.shape[1] == targets_array_clean.size

optimised_weights = reweight(
original_weights,
loss_matrix,
targets_array,
loss_matrix_clean,
targets_array_clean,
log_path="calibration_log.csv",
epochs=150,
)
data["household_weight"][year] = optimised_weights

print("\n\n---reweighting quick diagnostics----\n")
estimate = optimised_weights @ loss_matrix_clean
rel_error = (
((estimate - targets_array_clean) + 1)
/ (targets_array_clean + 1)
) ** 2
print(
f"rel_error: min: {np.min(rel_error):.2f}, "
f"max: {np.max(rel_error):.2f} "
f"mean: {np.mean(rel_error):.2f}, "
f"median: {np.median(rel_error):.2f}"
)
print("Relative error over 100% for:")
for i in np.where(rel_error > 1)[0]:
print(f"target_name: {loss_matrix_clean.columns[i]}")
print(f"target_value: {targets_array_clean[i]}")
print(f"estimate_value: {estimate[i]}")
print(f"has rel_error: {rel_error[i]:.2f}\n")
print("---End of reweighting quick diagnostics------")

self.save_dataset(data)


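The reweighting loop and diagnostics changed in `enhanced_cps.py` above can be sketched in miniature. The following is a hypothetical, NumPy-only reduction (the real code uses torch, dropout, and a calibration log): gradient descent on log-weights so weights stay positive, minimizing the squared relative error with the same `+1` offsets used in the quick diagnostics.

```python
import numpy as np

# Hypothetical NumPy-only sketch of the reweighting step: gradient descent
# on log-weights (so weights stay positive), minimizing the mean squared
# offset relative error ((estimate - target + 1) / (target + 1)) ** 2.
def reweight_sketch(original_weights, loss_matrix, targets, epochs=150, lr=0.1):
    log_w = np.log(np.asarray(original_weights, dtype=float))
    M = np.asarray(loss_matrix, dtype=float)
    t = np.asarray(targets, dtype=float)
    for _ in range(epochs):
        w = np.exp(log_w)
        estimate = w @ M                    # weighted estimate per target
        rel = (estimate - t + 1) / (t + 1)  # offset relative error
        # Analytic gradient of mean(rel**2) with respect to log-weights
        grad = (2 * rel / (t + 1)) @ M.T * w / t.size
        log_w -= lr * grad
    return np.exp(log_w)
```

Note that under this offset loss the estimate settles near `target - 1` rather than the target itself, which is negligible at the scale of real calibration targets.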
11 changes: 11 additions & 0 deletions policyengine_us_data/storage/README.md
@@ -0,0 +1,11 @@
# storage/ datasets

- **aca_spending_and_enrollment_2024.csv**
• Source: CMS Marketplace Public Use File, 2024 open-enrollment
• Date: 2024
• Location: https://www.cms.gov/files/document/health-insurance-exchanges-2024-open-enrollment-report-final.pdf

- **medicaid_enrollment_2024.csv**
• Source: MACPAC Enrollment Tables, FFY 2024
• Date: 2024
• Location: https://www.medicaid.gov/resources-for-states/downloads/eligib-oper-and-enrol-snap-december2024.pdf#page=26
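As a quick illustration, the enrollment table reads directly with pandas (two rows from the file inlined here; inside the repo you would read `policyengine_us_data/storage/medicaid_enrollment_2024.csv` instead):

```python
import io

import pandas as pd

# Two sample rows from medicaid_enrollment_2024.csv, inlined for
# illustration; swap the StringIO for the real path inside the repo.
sample = io.StringIO("state,enrollment\nAK,231577\nAL,766009\n")
df = pd.read_csv(sample)
total = df["enrollment"].sum()  # summed enrollment across the sampled states
```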
52 changes: 52 additions & 0 deletions policyengine_us_data/storage/medicaid_enrollment_2024.csv
@@ -0,0 +1,52 @@
state,enrollment
AK,231577
AL,766009
AR,733561
AZ,1778734
CA,12172695
CO,1058326
CT,904321
DC,240020
DE,236840
FL,3568648
GA,1699279
HI,376318
IA,586748
ID,296968
IL,2918179
IN,1623361
KS,335902
KY,1244822
LA,1377806
MA,1453344
MD,1280697
ME,322306
MI,2194067
MN,1146667
MO,1118780
MS,514730
MT,193278
NC,2469712
ND,100543
NE,302971
NH,166813
NJ,1506239
NM,686825
NV,713936
NY,5946806
OH,2596879
OK,894911
OR,1123313
PA,2783389
RI,273400
SC,932515
SD,126952
TN,1268904
TX,3821806
UT,300742
VA,1596777
VT,151833
WA,1776116
WI,1108320
WV,467632
WY,57320
40 changes: 40 additions & 0 deletions policyengine_us_data/tests/test_datasets/test_enhanced_cps.py
@@ -214,3 +214,43 @@ def test_aca_calibration():
assert (
not failed
), f"One or more states exceeded tolerance of {TOLERANCE:.0%}."


def test_medicaid_calibration():

import pandas as pd
from pathlib import Path
from policyengine_us import Microsimulation
from policyengine_us_data.datasets.cps import EnhancedCPS_2024

TARGETS_PATH = Path(
"policyengine_us_data/storage/medicaid_enrollment_2024.csv"
)
targets = pd.read_csv(TARGETS_PATH)

sim = Microsimulation(dataset=EnhancedCPS_2024)
state_code_hh = sim.calculate("state_code", map_to="household").values
medicaid_enrolled = sim.calculate(
"medicaid_enrolled", map_to="household", period=2025
)

TOLERANCE = 0.45
failed = False
for _, row in targets.iterrows():
state = row["state"]
target_enrollment = row["enrollment"]
simulated = medicaid_enrolled[state_code_hh == state].sum()

pct_error = abs(simulated - target_enrollment) / target_enrollment
print(
f"{state}: simulated {simulated/1e3:,.0f}k "
f"target {target_enrollment/1e3:,.0f}k "
f"error {pct_error:.2%}"
)

if pct_error > TOLERANCE:
failed = True

assert (
not failed
), f"One or more states exceeded tolerance of {TOLERANCE:.0%}."
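The per-state check in both calibration tests follows the same pattern; a minimal sketch (hypothetical helper name, toy numbers):

```python
# Hypothetical helper mirroring the per-state check in the tests above:
# pass when the absolute percentage error is within the tolerance.
def within_tolerance(simulated: float, target: float, tolerance: float = 0.45) -> bool:
    pct_error = abs(simulated - target) / target
    return pct_error <= tolerance
```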
38 changes: 38 additions & 0 deletions policyengine_us_data/utils/loss.py
@@ -519,6 +519,44 @@ def build_loss_matrix(dataset: type, time_period):
# Convert to thousands for the target
targets_array.append(row["enrollment"])

# Medicaid enrollment by state

enrollment_by_state = pd.read_csv(
STORAGE_FOLDER / "medicaid_enrollment_2024.csv"
)

# One-time pulls so we don’t re-compute inside the loop
state_person = sim.calculate("state_code", map_to="person").values

# Flag people in households that actually receive medicaid
has_medicaid = sim.calculate(
"medicaid_enrolled", map_to="person", period=2025
)
is_medicaid_eligible = sim.calculate(
"is_medicaid_eligible", map_to="person", period=2025
).values
is_enrolled = has_medicaid & is_medicaid_eligible

for _, row in enrollment_by_state.iterrows():
# People who both live in the state and are enrolled in Medicaid
in_state = state_person == row["state"]
in_state_enrolled = in_state & is_enrolled

label = f"irs/medicaid_enrollment/{row['state'].lower()}"
loss_matrix[label] = sim.map_result(
in_state_enrolled, "person", "household"
)
if any(loss_matrix[label].isna()):
raise ValueError(f"Missing values for {label}")

# Target is the raw enrollment count
targets_array.append(row["enrollment"])

print(
f"Targeting Medicaid enrollment for {row['state']} "
f"with target {row['enrollment']:,.0f}"
)

# State 10-year age targets

age_targets = pd.read_csv(STORAGE_FOLDER / "age_state.csv")
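The person-to-household aggregation that `sim.map_result` performs for the Medicaid columns above can be sketched with a plain groupby (toy data, hypothetical column names):

```python
import pandas as pd

# Toy person-level table; map_result's person -> household aggregation is,
# conceptually, a sum of the boolean flag within each household.
people = pd.DataFrame({
    "household_id": [0, 0, 1, 2],
    "state": ["AK", "AK", "AL", "AK"],
    "enrolled": [True, False, True, True],
})
in_state_enrolled = (people["state"] == "AK") & people["enrolled"]
household_column = in_state_enrolled.groupby(people["household_id"]).sum()
```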