From 35136b803ac204a4cbc69bd3d355914d17075423 Mon Sep 17 00:00:00 2001 From: daphnehanse11 <128793799+daphnehanse11@users.noreply.github.com> Date: Thu, 3 Jul 2025 10:43:45 -0400 Subject: [PATCH 01/34] new branch attempt --- changelog_entry.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..c2032d84 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: minor + changes: + added: + - Medicaid state level calibration targets. \ No newline at end of file From e2729a8709eeb24e4811e12fab719bdd6fd6bd9b Mon Sep 17 00:00:00 2001 From: daphnehanse11 <128793799+daphnehanse11@users.noreply.github.com> Date: Thu, 3 Jul 2025 10:47:44 -0400 Subject: [PATCH 02/34] recreation --- .gitignore | 1 + .../storage/medicaid_enrollment_2024.csv | 52 +++++++++++++++++++ .../tests/test_datasets/test_enhanced_cps.py | 40 ++++++++++++++ policyengine_us_data/utils/loss.py | 37 +++++++++++++ 4 files changed, 130 insertions(+) create mode 100644 policyengine_us_data/storage/medicaid_enrollment_2024.csv diff --git a/.gitignore b/.gitignore index a66fdef5..b65290c2 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ **/*.h5 **/*.csv !healthcare_spending.csv +!medicaid_enrollment_2024.csv !eitc.csv !spm_threshold_agi.csv **/_build diff --git a/policyengine_us_data/storage/medicaid_enrollment_2024.csv b/policyengine_us_data/storage/medicaid_enrollment_2024.csv new file mode 100644 index 00000000..67e115c7 --- /dev/null +++ b/policyengine_us_data/storage/medicaid_enrollment_2024.csv @@ -0,0 +1,52 @@ +state,enrollment +AK,231577 +AL,766009 +AR,733561 +AZ,1778734 +CA,12172695 +CO,1058326 +CT,904321 +DC,240020 +DE,236840 +FL,3568648 +GA,1699279 +HI,376318 +IA,586748 +ID,296968 +IL,2918179 +IN,1623361 +KS,335902 +KY,1244822 +LA,1377806 +MA,1453344 +MD,1280697 +ME,322306 +MI,2194067 +MN,1146667 +MO,1118780 +MS,514730 +MT,193278 +NC,2469712 +ND,100543 +NE,302971 +NH,166813 +NJ,1506239 +NM,686825 +NV,713936 +NY,5946806 +OH,2596879 +OK,894911 +OR,1123313 +PA,2783389 +RI,273400 +SC,932515 +SD,126952 +TN,1268904 +TX,3821806 +UT,300742 +VA,1596777 +VT,151833 +WA,1776116 +WI,1108320 +WV,467632 +WY,57320 \ No newline at end of file diff --git a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py index cdb47667..6d79d2fc 100644 --- a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py +++ b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py @@ -214,3 +214,43 @@ def test_aca_calibration(): assert ( not failed ), f"One or more states exceeded tolerance of {TOLERANCE:.0%}." + + +def test_medicaid_calibration(): + + import pandas as pd + from pathlib import Path + from policyengine_us import Microsimulation + from policyengine_us_data.datasets.cps import EnhancedCPS_2024 + + TARGETS_PATH = Path( + "policyengine_us_data/storage/medicaid_enrollment_2024.csv" + ) + targets = pd.read_csv(TARGETS_PATH) + + sim = Microsimulation(dataset=EnhancedCPS_2024) + state_code_hh = sim.calculate("state_code", map_to="household").values + medicaid_enrolled = sim.calculate( + "medicaid_enrolled", map_to="household", period=2025 + ) + + TOLERANCE = 0.45 + failed = False + for _, row in targets.iterrows(): + state = row["state"] + target_spending = row["spending"] + simulated = medicaid_enrolled[state_code_hh == state].sum() + + pct_error = abs(simulated - target_spending) / target_spending + print( + f"{state}: simulated ${simulated/1e9:.2f} bn " + f"target ${target_spending/1e9:.2f} bn " + f"error {pct_error:.2%}" + ) + + if pct_error > TOLERANCE: + failed = True + + assert ( + not failed + ), f"One or more states exceeded tolerance of {TOLERANCE:.0%}." diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index 9e3cdc60..74135a07 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -682,6 +682,43 @@ def _add_agi_metric_columns( loss_matrix[col_name] = metric return loss_matrix + # Medicaid enrollment by state + + enrollment_by_state = pd.read_csv( + STORAGE_FOLDER / "medicaid_enrollment_2024.csv" + ) + + # One-time pulls so we don’t re-compute inside the loop + state_person = sim.calculate("state_code", map_to="person").values + + # Flag people in households that actually receive medicaid + has_medicaid = sim.calculate( + "medicaid_enrolled", map_to="person", period=2025 + ) + is_medicaid_eligible = sim.calculate( + "is_medicaid_eligible", map_to="person", period=2025 + ).values + is_enrolled = has_medicaid & is_medicaid_eligible + + for _, row in enrollment_by_state.iterrows(): + # People who both live in the state and have marketplace coverage + in_state = state_person == row["state"] + in_state_enrolled = in_state & is_enrolled + + label = f"irs/medicaid_enrollment/{row['state'].lower()}" + loss_matrix[label] = sim.map_result( + in_state_enrolled, "person", "household" + ) + if any(loss_matrix[label].isna()): + raise ValueError(f"Missing values for {label}") + + # Convert to thousands for the target + targets_array.append(row["enrollment"]) + + print( + f"Targeting Medicaid enrollment for {row['state']} " + f"with target {row['enrollment']:.0f}k" + ) def _add_state_real_estate_taxes(loss_matrix, targets_list, sim): From 15a6c0a0c5a085da779787ea1ed2f6185282803f Mon Sep 17 00:00:00 2001 From: daphnehanse11 <128793799+daphnehanse11@users.noreply.github.com> Date: Thu, 3 Jul 2025 10:53:44 -0400 Subject: [PATCH 03/34] enrollment --- .../tests/test_datasets/test_enhanced_cps.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py index 6d79d2fc..abf67301 100644 --- a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py +++ b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py @@ -238,13 +238,13 @@ def test_medicaid_calibration(): failed = False for _, row in targets.iterrows(): state = row["state"] - target_spending = row["spending"] + target_enrollment = row["enrollment"] simulated = medicaid_enrolled[state_code_hh == state].sum() - pct_error = abs(simulated - target_spending) / target_spending + pct_error = abs(simulated - target_enrollment) / target_enrollment print( f"{state}: simulated ${simulated/1e9:.2f} bn " - f"target ${target_spending/1e9:.2f} bn " + f"target ${target_enrollment/1e9:.2f} bn " f"error {pct_error:.2%}" ) From 3b15c495c6dbde9914ed3d2dea1b0aea4be17d5b Mon Sep 17 00:00:00 2001 From: daphnehanse11 <128793799+daphnehanse11@users.noreply.github.com> Date: Thu, 3 Jul 2025 12:30:13 -0400 Subject: [PATCH 04/34] medicaid --- policyengine_us_data/utils/loss.py | 75 +++++++++++++++--------------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index 74135a07..21abce0f 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -519,6 +519,44 @@ def build_loss_matrix(dataset: type, time_period): # Convert to thousands for the target targets_array.append(row["enrollment"]) + # Medicaid enrollment by state + + enrollment_by_state = pd.read_csv( + STORAGE_FOLDER / "medicaid_enrollment_2024.csv" + ) + + # One-time pulls so we don’t re-compute inside the loop + state_person = sim.calculate("state_code", map_to="person").values + + # Flag people in households that actually receive medicaid + has_medicaid = sim.calculate( + "medicaid_enrolled", map_to="person", period=2025 + ) + is_medicaid_eligible = sim.calculate( + "is_medicaid_eligible", map_to="person", period=2025 + ).values + is_enrolled = has_medicaid & is_medicaid_eligible + + for _, row in enrollment_by_state.iterrows(): + # People who both live in the state and have marketplace coverage + in_state = state_person == row["state"] + in_state_enrolled = in_state & is_enrolled + + label = f"irs/medicaid_enrollment/{row['state'].lower()}" + loss_matrix[label] = sim.map_result( + in_state_enrolled, "person", "household" + ) + if any(loss_matrix[label].isna()): + raise ValueError(f"Missing values for {label}") + + # Convert to thousands for the target + targets_array.append(row["enrollment"]) + + print( + f"Targeting Medicaid enrollment for {row['state']} " + f"with target {row['enrollment']:.0f}k" + ) + # State 10-year age targets age_targets = pd.read_csv(STORAGE_FOLDER / "age_state.csv") @@ -682,43 +720,6 @@ def _add_agi_metric_columns( loss_matrix[col_name] = metric return loss_matrix - # Medicaid enrollment by state - - enrollment_by_state = pd.read_csv( - STORAGE_FOLDER / "medicaid_enrollment_2024.csv" - ) - - # One-time pulls so we don’t re-compute inside the loop - state_person = sim.calculate("state_code", map_to="person").values - - # Flag people in households that actually receive medicaid - has_medicaid = sim.calculate( - "medicaid_enrolled", map_to="person", period=2025 - ) - is_medicaid_eligible = sim.calculate( - "is_medicaid_eligible", map_to="person", period=2025 - ).values - is_enrolled = has_medicaid & is_medicaid_eligible - - for _, row in enrollment_by_state.iterrows(): - # People who both live in the state and have marketplace coverage - in_state = state_person == row["state"] - in_state_enrolled = in_state & is_enrolled - - label = f"irs/medicaid_enrollment/{row['state'].lower()}" - loss_matrix[label] = sim.map_result( - in_state_enrolled, "person", "household" - ) - if any(loss_matrix[label].isna()): - raise ValueError(f"Missing values for {label}") - - # Convert to thousands for the target - targets_array.append(row["enrollment"]) - - print( - f"Targeting Medicaid enrollment for {row['state']} " - f"with target {row['enrollment']:.0f}k" - ) def _add_state_real_estate_taxes(loss_matrix, targets_list, sim): From ee6c850786a75754058b6504a88bad457dcfe341 Mon Sep 17 00:00:00 2001 From: daphnehanse11 <128793799+daphnehanse11@users.noreply.github.com> Date: Mon, 7 Jul 2025 10:10:17 -0400 Subject: [PATCH 05/34] readme --- policyengine_us_data/storage/README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 policyengine_us_data/storage/README.md diff --git a/policyengine_us_data/storage/README.md b/policyengine_us_data/storage/README.md new file mode 100644 index 00000000..88431a8a --- /dev/null +++ b/policyengine_us_data/storage/README.md @@ -0,0 +1,17 @@ +# Sources for datasets + +# title: Medicaid enrollment 2024 +reference: + + +# storage/ datasets + +- **aca_spending_and_enrollment_2024.csv** + • Source: CMS Marketplace Public Use File, 2024 open-enrollment + • Date: 2024 + • Location: https://www.cms.gov/files/document/health-insurance-exchanges-2024-open-enrollment-report-final.pdf + +- **medicaid_enrollment_2024.csv** + • Source: MACPAC Enrollment Tables, FFY 2024 + • Date: 2024 + • Location: `https://www.medicaid.gov/resources-for-states/downloads/eligib-oper-and-enrol-snap-december2024.pdf#page=26 From 98901a0a7be528b59d961aba47aa379c4272bf20 Mon Sep 17 00:00:00 2001 From: PavelMakarchuk Date: Tue, 8 Jul 2025 11:59:00 +0200 Subject: [PATCH 06/34] Use "SURVIVING_SPOUSE" instead of "WIDOW" in `soi.py` Fixes #347 --- changelog_entry.yaml | 4 ++++ policyengine_us_data/datasets/cps/cps.py | 2 +- policyengine_us_data/utils/soi.py | 4 +++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..34cd58d6 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: patch + changes: + fixed: + - Use SURVIVING_SPOUSE and is_surviving_spouse instead of WIDOW and is_widowed. diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index f6fd0e75..3b976a31 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -356,7 +356,7 @@ def children_per_parent(col: str) -> pd.DataFrame: cps["cps_race"] = person.PRDTRACE cps["is_hispanic"] = person.PRDTHSP != 0 - cps["is_widowed"] = person.A_MARITL == 4 + cps["is_surviving_spouse"] = person.A_MARITL == 4 cps["is_separated"] = person.A_MARITL == 6 # High school or college/university enrollment status. cps["is_full_time_college_student"] = person.A_HSCOL == 2 diff --git a/policyengine_us_data/utils/soi.py b/policyengine_us_data/utils/soi.py index 9d53a142..a34babf6 100644 --- a/policyengine_us_data/utils/soi.py +++ b/policyengine_us_data/utils/soi.py @@ -218,7 +218,9 @@ def compare_soi_replication_to_soi(df, soi): elif fs == "Head of Household": subset = subset[subset.filing_status == "HEAD_OF_HOUSEHOLD"] elif fs == "Married Filing Jointly/Surviving Spouse": - subset = subset[subset.filing_status.isin(["JOINT", "WIDOW"])] + subset = subset[ + subset.filing_status.isin(["JOINT", "SURVIVING_SPOUSE"]) + ] elif fs == "Married Filing Separately": subset = subset[subset.filing_status == "SEPARATE"] From b51f9e3795c771ce2fa5d59a9e3d99fa4e302d04 Mon Sep 17 00:00:00 2001 From: daphnehanse11 <128793799+daphnehanse11@users.noreply.github.com> Date: Tue, 8 Jul 2025 15:40:26 -0400 Subject: [PATCH 07/34] updates --- changelog_entry.yaml | 3 ++- policyengine_us_data/storage/README.md | 8 +------- policyengine_us_data/storage/medicaid_enrollment_2024.csv | 2 +- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index c2032d84..c6f8c35d 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,4 +1,5 @@ - bump: minor changes: added: - - Medicaid state level calibration targets. \ No newline at end of file + - Medicaid state level calibration targets. + \ No newline at end of file diff --git a/policyengine_us_data/storage/README.md b/policyengine_us_data/storage/README.md index 88431a8a..55f98ed9 100644 --- a/policyengine_us_data/storage/README.md +++ b/policyengine_us_data/storage/README.md @@ -1,9 +1,3 @@ -# Sources for datasets - -# title: Medicaid enrollment 2024 -reference: - - # storage/ datasets - **aca_spending_and_enrollment_2024.csv** @@ -14,4 +8,4 @@ reference: - **medicaid_enrollment_2024.csv** • Source: MACPAC Enrollment Tables, FFY 2024 • Date: 2024 - • Location: `https://www.medicaid.gov/resources-for-states/downloads/eligib-oper-and-enrol-snap-december2024.pdf#page=26 + • Location: https://www.medicaid.gov/resources-for-states/downloads/eligib-oper-and-enrol-snap-december2024.pdf#page=26 diff --git a/policyengine_us_data/storage/medicaid_enrollment_2024.csv b/policyengine_us_data/storage/medicaid_enrollment_2024.csv index 67e115c7..108670c5 100644 --- a/policyengine_us_data/storage/medicaid_enrollment_2024.csv +++ b/policyengine_us_data/storage/medicaid_enrollment_2024.csv @@ -49,4 +49,4 @@ VT,151833 WA,1776116 WI,1108320 WV,467632 -WY,57320 \ No newline at end of file +WY,57320 From 0584a3c7a9609d745104af806ff0b47ac9db0437 Mon Sep 17 00:00:00 2001 From: MaxGhenis Date: Tue, 8 Jul 2025 21:53:05 +0000 Subject: [PATCH 08/34] Update package version --- CHANGELOG.md | 7 +++++++ changelog.yaml | 5 +++++ changelog_entry.yaml | 4 ---- pyproject.toml | 2 +- 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e8b2a506..4abf90f0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.36.2] - 2025-07-08 21:53:02 + +### Fixed + +- Use SURVIVING_SPOUSE and is_surviving_spouse instead of WIDOW and is_widowed. + ## [1.36.1] - 2025-07-03 09:21:06 ### Changed @@ -508,6 +514,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 +[1.36.2]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.36.1...1.36.2 [1.36.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.36.0...1.36.1 [1.36.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.35.2...1.36.0 [1.35.2]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.35.1...1.35.2 diff --git a/changelog.yaml b/changelog.yaml index 2f07e429..453cabf9 100644 --- a/changelog.yaml +++ b/changelog.yaml @@ -423,3 +423,8 @@ changed: - PR tests to be more similar to production builds. date: 2025-07-03 09:21:06 +- bump: patch + changes: + fixed: + - Use SURVIVING_SPOUSE and is_surviving_spouse instead of WIDOW and is_widowed. + date: 2025-07-08 21:53:02 diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 34cd58d6..e69de29b 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,4 +0,0 @@ -- bump: patch - changes: - fixed: - - Use SURVIVING_SPOUSE and is_surviving_spouse instead of WIDOW and is_widowed. diff --git a/pyproject.toml b/pyproject.toml index 2f8ca522..833e554a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta" [project] name = "policyengine_us_data" -version = "1.36.1" +version = "1.36.2" description = "A package to create representative microdata for the US." readme = "README.md" authors = [ From 31fbfe35d1e6744ea4f744518da7dce6f25e4f19 Mon Sep 17 00:00:00 2001 From: MaxGhenis Date: Wed, 9 Jul 2025 14:58:35 +0000 Subject: [PATCH 09/34] Update package version --- CHANGELOG.md | 7 +++++++ changelog.yaml | 5 +++++ changelog_entry.yaml | 5 ----- pyproject.toml | 2 +- 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4abf90f0..6299d8fb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.37.0] - 2025-07-09 14:58:33 + +### Added + +- Medicaid state level calibration targets. + ## [1.36.2] - 2025-07-08 21:53:02 ### Fixed @@ -514,6 +520,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 +[1.37.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.36.2...1.37.0 [1.36.2]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.36.1...1.36.2 [1.36.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.36.0...1.36.1 [1.36.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.35.2...1.36.0 diff --git a/changelog.yaml b/changelog.yaml index 453cabf9..699b2430 100644 --- a/changelog.yaml +++ b/changelog.yaml @@ -428,3 +428,8 @@ fixed: - Use SURVIVING_SPOUSE and is_surviving_spouse instead of WIDOW and is_widowed. date: 2025-07-08 21:53:02 +- bump: minor + changes: + added: + - Medicaid state level calibration targets. + date: 2025-07-09 14:58:33 diff --git a/changelog_entry.yaml b/changelog_entry.yaml index c6f8c35d..e69de29b 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,5 +0,0 @@ -- bump: minor - changes: - added: - - Medicaid state level calibration targets. - \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 833e554a..0352db69 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta" [project] name = "policyengine_us_data" -version = "1.36.2" +version = "1.37.0" description = "A package to create representative microdata for the US." readme = "README.md" authors = [ From c621c5efc9c062fc31f9d6e9e06b7297f2e70c93 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 13:44:20 -0400 Subject: [PATCH 10/34] added CPS_2023 to lite mode generation --- changelog_entry.yaml | 4 ++++ policyengine_us_data/datasets/cps/cps.py | 1 + 2 files changed, 5 insertions(+) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..dcce3f1a 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: patch + changes: + changed: + - lite mode now builds CPS_2023 in addition to CPS_2024 diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 3b976a31..fde981ba 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2006,6 +2006,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if __name__ == "__main__": if test_lite: + CPS_2023().generate() CPS_2024().generate() else: CPS_2021().generate() From 30846a991a53b7c313e2761bdd01523cc308b42e Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 14:54:23 -0400 Subject: [PATCH 11/34] Fixed manual test --- .github/workflows/code_changes.yaml | 1 + .github/workflows/manual_tests.yaml | 2 +- policyengine_us_data/datasets/cps/cps.py | 1 + pyproject.toml | 2 +- 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/code_changes.yaml b/.github/workflows/code_changes.yaml index 6b474227..edd804db 100644 --- a/.github/workflows/code_changes.yaml +++ b/.github/workflows/code_changes.yaml @@ -2,6 +2,7 @@ name: Code changes on: + workflow_call: push: branches: - main diff --git a/.github/workflows/manual_tests.yaml b/.github/workflows/manual_tests.yaml index a2daca18..fb13ba89 100644 --- a/.github/workflows/manual_tests.yaml +++ b/.github/workflows/manual_tests.yaml @@ -11,7 +11,7 @@ on: jobs: test: - uses: ./.github/workflows/pr_changelog.yaml + uses: ./.github/workflows/code_changes.yaml with: TEST_LITE: ${{ github.event.inputs.test_lite }} secrets: inherit diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index fde981ba..177f4707 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2008,6 +2008,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() + print(2 + 2) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index 0352db69..3490ff1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ authors = [ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ - "policyengine-us>=1.197.0", + "policyengine-us>=1.333.0", "policyengine-core>=3.14.1", "requests", "tqdm", From 552de174a3bd6c05199737bc398fb6724e13b7fa Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 15:02:22 -0400 Subject: [PATCH 12/34] try again with locked version --- policyengine_us_data/datasets/cps/cps.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 177f4707..09a594c3 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2008,7 +2008,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 2) + print(2 + 3) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index 3490ff1b..74af05bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ authors = [ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ - "policyengine-us>=1.333.0", + "policyengine-us==1.333.0", "policyengine-core>=3.14.1", "requests", "tqdm", From ab71dd4483ab24ab83ec61fd71260c77cb4bc9ac Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 15:44:32 -0400 Subject: [PATCH 13/34] trying things --- policyengine_us_data/datasets/cps/cps.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 09a594c3..1edce6e9 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -100,9 +100,14 @@ def downsample(self, frac: float): original_dtypes = { key: original_data[key].dtype for key in original_data } - + print("\n\nHERE IS THE PROBLEM-----") + print(f"frac is {frac}") + print(self) + print(Microsimulation) sim = Microsimulation(dataset=self) - sim.subsample(frac=frac) + print(sim) + print(sim.subsample) + #sim.subsample(frac=frac) for key in original_data: if key not in sim.tax_benefit_system.variables: From 57fbddafc1ea429fbebb6e8c50dc0bc585e99115 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 15:45:47 -0400 Subject: [PATCH 14/34] lint --- policyengine_us_data/datasets/cps/cps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 1edce6e9..30688719 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -107,7 +107,7 @@ def downsample(self, frac: float): sim = Microsimulation(dataset=self) print(sim) print(sim.subsample) - #sim.subsample(frac=frac) + # sim.subsample(frac=frac) for key in original_data: if key not in sim.tax_benefit_system.variables: From efd267f53c0838af1cad1613baf58868d4a51314 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 16:05:26 -0400 Subject: [PATCH 15/34] trying 3.11.12 --- policyengine_us_data/datasets/cps/cps.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 30688719..8219e915 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -107,7 +107,7 @@ def downsample(self, frac: float): sim = Microsimulation(dataset=self) print(sim) print(sim.subsample) - # sim.subsample(frac=frac) + sim.subsample(frac=frac) for key in original_data: if key not in sim.tax_benefit_system.variables: @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 3) + print(2 + 5) else: CPS_2021().generate() CPS_2022().generate() From eb390dd580c9cd1407f32c5b19dbd8d9784d828f Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 16:10:26 -0400 Subject: [PATCH 16/34] now actually specifying py version --- policyengine_us_data/datasets/cps/cps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 8219e915..a25aba26 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 5) + print(2 + 7) else: CPS_2021().generate() CPS_2022().generate() From 9d69d6dd1cd6fc759f2c33417bdf204c401c170a Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 16:33:21 -0400 Subject: [PATCH 17/34] pandas v --- policyengine_us_data/datasets/cps/cps.py | 2 +- pyproject.toml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index a25aba26..b3554604 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 7) + print(2 + 8) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index 74af05bf..6c767ede 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ requires-python = ">=3.11, <3.13.0" dependencies = [ "policyengine-us==1.333.0", "policyengine-core>=3.14.1", + "pandas==2.3.1", "requests", "tqdm", "microdf_python>=0.4.3", From 54a7f9d1cbafae7a5962095af4bff12673f62fd8 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 16:47:12 -0400 Subject: [PATCH 18/34] small runner --- .github/workflows/pr_code_changes.yaml | 2 +- policyengine_us_data/datasets/cps/cps.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 213d192f..385e5a4c 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -49,7 +49,7 @@ jobs: run: python -c "from policyengine_core.data import Dataset; print('Core import OK')" Test: - runs-on: larger-runner + runs-on: ubuntu-latest needs: Lint env: HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index b3554604..027c2ef5 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 8) + print(2 + 0) else: CPS_2021().generate() CPS_2022().generate() From 6122adfeeb1276b1641246d8466388829df013d2 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 16:53:57 -0400 Subject: [PATCH 19/34] trying everything --- .github/workflows/pr_code_changes.yaml | 2 +- policyengine_us_data/datasets/cps/cps.py | 2 +- pyproject.toml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 385e5a4c..02209591 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -63,7 +63,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.11' + python-version: '3.11.12' - name: Install package run: uv pip install -e .[dev] --system diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 027c2ef5..afbf223f 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 0) + print(2 + 9) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index 6c767ede..d87290a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,11 +15,11 @@ authors = [ {name = "PolicyEngine", email = "hello@policyengine.org"}, ] license = {file = "LICENSE"} -requires-python = ">=3.11, <3.13.0" +requires-python = ">=3.11, <3.11.13" dependencies = [ "policyengine-us==1.333.0", "policyengine-core>=3.14.1", - "pandas==2.3.1", + "pandas==2.3.0", "requests", "tqdm", "microdf_python>=0.4.3", From bcb6b1d8e7454546874d21ac307d0b2a9d5ed08e Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 17:02:45 -0400 Subject: [PATCH 20/34] relaxing python version in pyproject.toml --- policyengine_us_data/datasets/cps/cps.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index afbf223f..3173d4d6 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 9) + print(3 + 9) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index d87290a2..fe5fda52 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ authors = [ {name = "PolicyEngine", email = "hello@policyengine.org"}, ] license = {file = "LICENSE"} -requires-python = ">=3.11, <3.11.13" +requires-python = ">=3.11, <3.13.0" dependencies = [ "policyengine-us==1.333.0", "policyengine-core>=3.14.1", From eed999390421a8a9a02b83076a3e3af788aac186 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 17:29:12 -0400 Subject: [PATCH 21/34] putting things back in order. --- policyengine_us_data/datasets/cps/cps.py | 7 ------- pyproject.toml | 2 +- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 3173d4d6..d9957cbb 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -100,13 +100,7 @@ def downsample(self, frac: float): original_dtypes = { key: original_data[key].dtype for key in original_data } - print("\n\nHERE IS THE PROBLEM-----") - print(f"frac is {frac}") - print(self) - print(Microsimulation) sim = Microsimulation(dataset=self) - print(sim) - print(sim.subsample) sim.subsample(frac=frac) for key in original_data: @@ -2013,7 +2007,6 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(3 + 9) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index fe5fda52..4bec19eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ authors = [ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ - "policyengine-us==1.333.0", + "policyengine-us==1.340.1", "policyengine-core>=3.14.1", "pandas==2.3.0", "requests", From 65473ce74d24808388f50599a4bb9179f76c311c Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Sat, 12 Jul 2025 13:01:15 +0100 Subject: [PATCH 22/34] Use normal runner in PR tests --- .github/workflows/pr_code_changes.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 213d192f..4e30d089 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -49,7 +49,7 @@ jobs: run: python -c "from policyengine_core.data import Dataset; print('Core import OK')" Test: - runs-on: larger-runner + runs-on: ubuntu-latest needs: Lint env: HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} From ff4e9a634e5e727362e881a394d368a291bfcb9b Mon Sep 17 00:00:00 2001 From: baogorek Date: Sat, 12 Jul 2025 09:53:07 -0400 Subject: [PATCH 23/34] added the 3.11.12 pin --- .github/workflows/code_changes.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/code_changes.yaml b/.github/workflows/code_changes.yaml index b752e953..fef913e9 100644 --- a/.github/workflows/code_changes.yaml +++ b/.github/workflows/code_changes.yaml @@ -38,7 +38,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.11' + python-version: '3.11.12' - uses: "google-github-actions/auth@v2" with: workload_identity_provider: "projects/322898545428/locations/global/workloadIdentityPools/policyengine-research-id-pool/providers/prod-github-provider" From 43ad4394ff074b8419e7fa954e2964682e22bc3c Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 20:00:50 -0400 Subject: [PATCH 24/34] cps.py --- policyengine_us_data/datasets/cps/cps.py | 1 + 1 file changed, 1 insertion(+) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index d9957cbb..202f9c69 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2007,6 +2007,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() + print(3) else: CPS_2021().generate() CPS_2022().generate() From 879770e8aefcaa5c25f503bb877766b24f372d67 Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 20:32:26 -0400 Subject: [PATCH 25/34] adding diagnostics --- .../datasets/cps/enhanced_cps.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index b8af12ce..c01a6b17 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -190,6 +190,23 @@ def generate(self): ) data["household_weight"][year] = optimised_weights + print("\n\n---reweighting quick diagnostics----\n") + estimate = optimised_weights @ loss_matrix + rel_error = ( + ((estimate - targets_array) + 1) / (targets_array + 1) + ) ** 2 + print( + f"rel_error: min: {np.min(rel_error):.2f}, max: {np.max(rel_error):.2f}", + f"mean: {np.mean(rel_error):.2f}, median: {np.median(rel_error):.2f}" + ) + print("Relative error over 100% for:") + for i in np.where(rel_error > 1)[0]: + print(f"target_name: {loss_matrix.columns[i]}") + print(f"target_value: {targets_array[i]}") + print(f"estimate_value: {estimate[i]}") + print(f"has rel_error: {rel_error.values[i]:.2f}\n") + print("---End of reweighting quick diagnostics------") + self.save_dataset(data) From 10b725cfaeb36041dd5164c830a9ae2292f3a537 Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 20:32:47 -0400 Subject: [PATCH 26/34] lint --- policyengine_us_data/datasets/cps/enhanced_cps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index c01a6b17..d4dd5ba6 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -197,7 +197,7 @@ def generate(self): ) ** 2 print( f"rel_error: min: {np.min(rel_error):.2f}, max: {np.max(rel_error):.2f}", - f"mean: {np.mean(rel_error):.2f}, median: {np.median(rel_error):.2f}" + f"mean: {np.mean(rel_error):.2f}, median: {np.median(rel_error):.2f}", ) print("Relative error over 100% for:") for i in np.where(rel_error > 1)[0]: From f0973be60eef539992dbaf75fd850352226e460b Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 22:27:46 -0400 Subject: [PATCH 27/34] taking out bad targets --- policyengine_us_data/datasets/cps/cps.py | 1 - .../datasets/cps/enhanced_cps.py | 62 +++++++++++++++++-- 2 files changed, 58 insertions(+), 5 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 202f9c69..d9957cbb 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2007,7 +2007,6 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(3) else: CPS_2021().generate() CPS_2022().generate() diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index d4dd5ba6..dab9df78 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -28,6 +28,7 @@ def reweight( targets_array, dropout_rate=0.05, log_path="calibration_log.csv", + epochs=150, ): target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") @@ -45,7 +46,7 @@ def reweight( np.log(original_weights), requires_grad=True, dtype=torch.float32 ) - # TODO: replace this with a call to the python reweight.py package. + # TODO: replace this functionality from the microcalibrate package. def loss(weights): # Check for Nans in either the weights or the loss matrix if torch.isnan(weights).any(): @@ -78,7 +79,7 @@ def dropout_weights(weights, p): start_loss = None - iterator = trange(500) + iterator = trange(epochs) performance = pd.DataFrame() for i in iterator: optimizer.zero_grad() @@ -178,18 +179,71 @@ def generate(self): original_weights = original_weights.values + np.random.normal( 1, 0.1, len(original_weights) ) + + bad_targets = [ + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "state/RI/adjusted_gross_income/amount/-inf_1", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "state/RI/adjusted_gross_income/amount/-inf_1", + "target_name: nation/irs/exempt interest/count/AGI in -inf-inf/taxable/All", + ] + + # Run the optimization procedure to get (close to) minimum loss weights for year in range(self.start_year, self.end_year + 1): loss_matrix, targets_array = build_loss_matrix( self.input_dataset, year ) + zero_mask = np.isclose(targets_array, 0.0, atol=0.1) + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~(zero_mask | bad_mask) + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_array_clean = targets_array[keep_idx] + assert loss_matrix_clean.shape[1] == targets_array_clean.size + optimised_weights = reweight( original_weights, - loss_matrix, - targets_array, + loss_matrix_clean, + targets_array_clean, log_path="calibration_log.csv", + epochs=150, ) data["household_weight"][year] = optimised_weights + print("\n\n---reweighting quick diagnostics----\n") + estimate = optimised_weights @ loss_matrix_clean + rel_error = ( + ((estimate - targets_array_clean) + 1) + / (targets_array_clean + 1) + ) ** 2 + print( + f"rel_error: min: {np.min(rel_error):.2f}, " + f"max: {np.max(rel_error):.2f} " + f"mean: {np.mean(rel_error):.2f}, " + f"median: {np.median(rel_error):.2f}" + ) + print("Relative error over 100% for:") + for i in np.where(rel_error > 1)[0]: + print(f"target_name: {loss_matrix_clean.columns[i]}") + print(f"target_value: {targets_array_clean[i]}") + print(f"estimate_value: {estimate[i]}") + print(f"has rel_error: {rel_error[i]:.2f}\n") + print("---End of reweighting quick diagnostics------") + print("\n\n---reweighting quick diagnostics----\n") estimate = optimised_weights @ loss_matrix rel_error = ( From d8667a19d7641565bb85171dde03a6a55d2ee3c4 Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 23:05:09 -0400 Subject: [PATCH 28/34] fixing workflow arg passthrough --- .github/workflows/pr_code_changes.yaml | 16 +++++++++++++--- changelog_entry.yaml | 6 ++++++ pyproject.toml | 4 ++-- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index c84a4b97..56224a2e 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -11,6 +11,14 @@ on: - tests/** - .github/workflows/** + workflow_call: + inputs: + TEST_LITE: + description: 'Run in lite mode' + type: boolean + required: false + default: false + jobs: Lint: runs-on: ubuntu-latest @@ -53,6 +61,7 @@ jobs: needs: Lint env: HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} + TEST_LITE: ${{ inputs.TEST_LITE }} steps: - name: Checkout repo uses: actions/checkout@v2 @@ -63,7 +72,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.11.12' + python-version: '3.11' - name: Install package run: uv pip install -e .[dev] --system @@ -75,8 +84,9 @@ jobs: - name: Build datasets run: make data env: - TEST_LITE: true - PYTHON_LOG_LEVEL: INFO + TEST_LITE: ${{ env.TEST_LITE }} + PYTHON_LOG_LEVEL: INFO + - name: Save calibration log uses: actions/upload-artifact@v4 with: diff --git a/changelog_entry.yaml b/changelog_entry.yaml index dcce3f1a..bce8b349 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,4 +1,10 @@ - bump: patch changes: changed: + - bad targets (causing problems with estimation) removed - lite mode now builds CPS_2023 in addition to CPS_2024 + - gave reweight an epochs argument and set it at 150 for optimization + - updating minimum versions on policyengine-us and pandas dependencies + fixed: + - manual workflow now can call PR code changes + diff --git a/pyproject.toml b/pyproject.toml index 4bec19eb..481cbc37 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,9 +17,9 @@ authors = [ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ - "policyengine-us==1.340.1", + "policyengine-us>=1.340.1", "policyengine-core>=3.14.1", - "pandas==2.3.0", + "pandas>=2.3.0", "requests", "tqdm", "microdf_python>=0.4.3", From 8f14e52158b99f5ec3c5cd43279ae0799955c1f0 Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 23:09:32 -0400 Subject: [PATCH 29/34] deps and defaults --- .github/workflows/code_changes.yaml | 2 +- .github/workflows/pr_code_changes.yaml | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/code_changes.yaml b/.github/workflows/code_changes.yaml index fef913e9..b752e953 100644 --- a/.github/workflows/code_changes.yaml +++ b/.github/workflows/code_changes.yaml @@ -38,7 +38,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.11.12' + python-version: '3.11' - uses: "google-github-actions/auth@v2" with: workload_identity_provider: "projects/322898545428/locations/global/workloadIdentityPools/policyengine-research-id-pool/providers/prod-github-provider" diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 56224a2e..1e05b564 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -17,7 +17,7 @@ on: description: 'Run in lite mode' type: boolean required: false - default: false + default: true jobs: Lint: diff --git a/pyproject.toml b/pyproject.toml index 481cbc37..f983258d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ "policyengine-us>=1.340.1", - "policyengine-core>=3.14.1", + "policyengine-core>=3.17.1", "pandas>=2.3.0", "requests", "tqdm", From 29e1621d74802624ef647fa442b8e360931a438f Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 23:12:21 -0400 Subject: [PATCH 30/34] wrong pipeline for manual test --- .github/workflows/manual_tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/manual_tests.yaml b/.github/workflows/manual_tests.yaml index fb13ba89..fd6fa061 100644 --- a/.github/workflows/manual_tests.yaml +++ b/.github/workflows/manual_tests.yaml @@ -11,7 +11,7 @@ on: jobs: test: - uses: ./.github/workflows/code_changes.yaml + uses: ./.github/workflows/pr_code_changes.yaml with: TEST_LITE: ${{ github.event.inputs.test_lite }} secrets: inherit From fcd5835d73bcb13291be79e36784cdb9cab47b9b Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 23:30:46 -0400 Subject: [PATCH 31/34] trying again to get the manual test to work --- .github/workflows/manual_tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/manual_tests.yaml b/.github/workflows/manual_tests.yaml index fd6fa061..55667dbc 100644 --- a/.github/workflows/manual_tests.yaml +++ b/.github/workflows/manual_tests.yaml @@ -13,5 +13,5 @@ jobs: test: uses: ./.github/workflows/pr_code_changes.yaml with: - TEST_LITE: ${{ github.event.inputs.test_lite }} + TEST_LITE: ${{ inputs.test_lite }} secrets: inherit From 82272b6458f14ac49d5bf4c82eee4faa4746236b Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 23:53:27 -0400 Subject: [PATCH 32/34] reverting to older workflow code --- .github/workflows/manual_tests.yaml | 17 ----------------- .github/workflows/pr_code_changes.yaml | 14 ++------------ changelog_entry.yaml | 4 +--- 3 files changed, 3 insertions(+), 32 deletions(-) delete mode 100644 .github/workflows/manual_tests.yaml diff --git a/.github/workflows/manual_tests.yaml b/.github/workflows/manual_tests.yaml deleted file mode 100644 index 55667dbc..00000000 --- a/.github/workflows/manual_tests.yaml +++ /dev/null @@ -1,17 +0,0 @@ -name: Manual tests - -on: - workflow_dispatch: - inputs: - test_lite: - description: 'Run in lite mode' - required: true - default: true - type: boolean - -jobs: - test: - uses: ./.github/workflows/pr_code_changes.yaml - with: - TEST_LITE: ${{ inputs.test_lite }} - secrets: inherit diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 1e05b564..4e30d089 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -11,14 +11,6 @@ on: - tests/** - .github/workflows/** - workflow_call: - inputs: - TEST_LITE: - description: 'Run in lite mode' - type: boolean - required: false - default: true - jobs: Lint: runs-on: ubuntu-latest @@ -61,7 +53,6 @@ jobs: needs: Lint env: HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} - TEST_LITE: ${{ inputs.TEST_LITE }} steps: - name: Checkout repo uses: actions/checkout@v2 @@ -84,9 +75,8 @@ jobs: - name: Build datasets run: make data env: - TEST_LITE: ${{ env.TEST_LITE }} - PYTHON_LOG_LEVEL: INFO - + TEST_LITE: true + PYTHON_LOG_LEVEL: INFO - name: Save calibration log uses: actions/upload-artifact@v4 with: diff --git a/changelog_entry.yaml b/changelog_entry.yaml index bce8b349..3f9b8627 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -5,6 +5,4 @@ - lite mode now builds CPS_2023 in addition to CPS_2024 - gave reweight an epochs argument and set it at 150 for optimization - updating minimum versions on policyengine-us and pandas dependencies - fixed: - - manual workflow now can call PR code changes - + - getting rid of non-working manual workflow code From ab8fa4f1b6cfc40e6f8ebfb70b2c5f85580b774e Mon Sep 17 00:00:00 2001 From: baogorek Date: Mon, 14 Jul 2025 00:12:37 -0400 Subject: [PATCH 33/34] cleaning up enhanced_cps.py --- .../datasets/cps/enhanced_cps.py | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index dab9df78..7a471d40 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -199,7 +199,7 @@ def generate(self): "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", "state/RI/adjusted_gross_income/amount/-inf_1", - "target_name: nation/irs/exempt interest/count/AGI in -inf-inf/taxable/All", + "nation/irs/exempt interest/count/AGI in -inf-inf/taxable/All", ] # Run the optimization procedure to get (close to) minimum loss weights @@ -244,23 +244,6 @@ def generate(self): print(f"has rel_error: {rel_error[i]:.2f}\n") print("---End of reweighting quick diagnostics------") - print("\n\n---reweighting quick diagnostics----\n") - estimate = optimised_weights @ loss_matrix - rel_error = ( - ((estimate - targets_array) + 1) / (targets_array + 1) - ) ** 2 - print( - f"rel_error: min: {np.min(rel_error):.2f}, max: {np.max(rel_error):.2f}", - f"mean: {np.mean(rel_error):.2f}, median: {np.median(rel_error):.2f}", - ) - print("Relative error over 100% for:") - for i in np.where(rel_error > 1)[0]: - print(f"target_name: {loss_matrix.columns[i]}") - print(f"target_value: {targets_array[i]}") - print(f"estimate_value: {estimate[i]}") - print(f"has rel_error: {rel_error.values[i]:.2f}\n") - print("---End of reweighting quick diagnostics------") - self.save_dataset(data) From 2508741c8df49861868e89da94c02075b404c86f Mon Sep 17 00:00:00 2001 From: MaxGhenis Date: Mon, 14 Jul 2025 15:33:13 +0000 Subject: [PATCH 34/34] Update package version --- CHANGELOG.md | 11 +++++++++++ changelog.yaml | 9 +++++++++ changelog_entry.yaml | 8 -------- pyproject.toml | 2 +- 4 files changed, 21 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6299d8fb..e355d4dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.37.1] - 2025-07-14 15:33:11 + +### Changed + +- bad targets (causing problems with estimation) removed +- lite mode now builds CPS_2023 in addition to CPS_2024 +- gave reweight an epochs argument and set it at 150 for optimization +- updating minimum versions on policyengine-us and pandas dependencies +- getting rid of non-working manual workflow code + ## [1.37.0] - 2025-07-09 14:58:33 ### Added @@ -520,6 +530,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 +[1.37.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.37.0...1.37.1 [1.37.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.36.2...1.37.0 [1.36.2]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.36.1...1.36.2 [1.36.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.36.0...1.36.1 diff --git a/changelog.yaml b/changelog.yaml index 699b2430..af7cdf32 100644 --- a/changelog.yaml +++ b/changelog.yaml @@ -433,3 +433,12 @@ added: - Medicaid state level calibration targets. date: 2025-07-09 14:58:33 +- bump: patch + changes: + changed: + - bad targets (causing problems with estimation) removed + - lite mode now builds CPS_2023 in addition to CPS_2024 + - gave reweight an epochs argument and set it at 150 for optimization + - updating minimum versions on policyengine-us and pandas dependencies + - getting rid of non-working manual workflow code + date: 2025-07-14 15:33:11 diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 3f9b8627..e69de29b 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,8 +0,0 @@ -- bump: patch - changes: - changed: - - bad targets (causing problems with estimation) removed - - lite mode now builds CPS_2023 in addition to CPS_2024 - - gave reweight an epochs argument and set it at 150 for optimization - - updating minimum versions on policyengine-us and pandas dependencies - - getting rid of non-working manual workflow code diff --git a/pyproject.toml b/pyproject.toml index f983258d..5a75693f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta" [project] name = "policyengine_us_data" -version = "1.37.0" +version = "1.37.1" description = "A package to create representative microdata for the US." readme = "README.md" authors = [