From 027a273072f036fedf39a32d8d55179a7c3ad478 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 5 Oct 2025 15:11:32 -0400 Subject: [PATCH 1/4] Move all randomness to data package for deterministic country package MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change moves ALL random number generation from policyengine-us into the dataset generation in policyengine-us-data. The country package is now a purely deterministic rules engine. ## Key Changes ### policyengine-us-data: - Add take-up rate YAML parameter files in `parameters/take_up/` - Generate all stochastic boolean take-up decisions in CPS dataset - Use seeded RNG (seed=100) for full reproducibility ### Stochastic variables generated: **Take-up decisions (boolean):** - takes_up_snap_if_eligible - takes_up_aca_if_eligible - takes_up_medicaid_if_eligible - takes_up_eitc (already boolean) - takes_up_dc_ptc (already boolean) All random generation now uses np.random.default_rng(seed=100) for full reproducibility across dataset builds. ## Trade-offs **IMPORTANT**: Take-up rates can no longer be adjusted dynamically via policy reforms or in the web app. They are fixed in the microdata. This is an acceptable trade-off for the cleaner architecture of keeping the country package purely deterministic. To adjust take-up rates, the microdata must be regenerated. 
Related: policyengine-us PR (must be merged after this) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- household_counts.txt | 12 +++++ policyengine_us_data/datasets/cps/cps.py | 40 ++++++++++---- policyengine_us_data/parameters/__init__.py | 53 +++++++++++++++++++ .../parameters/take_up/aca.yaml | 10 ++++ .../parameters/take_up/dc_ptc.yaml | 11 ++++ .../parameters/take_up/eitc.yaml | 12 +++++ .../parameters/take_up/medicaid.yaml | 10 ++++ .../parameters/take_up/snap.yaml | 9 ++++ 8 files changed, 148 insertions(+), 9 deletions(-) create mode 100644 household_counts.txt create mode 100644 policyengine_us_data/parameters/__init__.py create mode 100644 policyengine_us_data/parameters/take_up/aca.yaml create mode 100644 policyengine_us_data/parameters/take_up/dc_ptc.yaml create mode 100644 policyengine_us_data/parameters/take_up/eitc.yaml create mode 100644 policyengine_us_data/parameters/take_up/medicaid.yaml create mode 100644 policyengine_us_data/parameters/take_up/snap.yaml diff --git a/household_counts.txt b/household_counts.txt new file mode 100644 index 00000000..b336dd0a --- /dev/null +++ b/household_counts.txt @@ -0,0 +1,12 @@ +L0 Value | Household Count +---------|---------------- +4.9999e-07 | 7,977 +4.999e-07 | 8,005 +5.0000078125e-07 | 7,784 +5.000015625e-07 | 7,617 +5.00003125e-07 | 8,069 +5.0000625e-07 | 7,908 +5.000125e-07 | 7,937 +5.00025e-07 | 7,889 +5.0005e-07 | 7,935 +5.002e-07 | 7,897 diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index f932e0d5..6fe0c373 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -14,6 +14,7 @@ ) from microimpute.models.qrf import QRF import logging +from policyengine_us_data.parameters import load_take_up_rate test_lite = os.environ.get("TEST_LITE") == "true" @@ -207,25 +208,46 @@ def add_takeup(self): from policyengine_us import system, Microsimulation baseline = 
Microsimulation(dataset=self) - parameters = baseline.tax_benefit_system.parameters(self.time_period) + # Generate all stochastic take-up decisions using take-up rates from parameter files + # This keeps the country package purely deterministic generator = np.random.default_rng(seed=100) - eitc_takeup_rates = parameters.gov.irs.credits.eitc.takeup + # Load take-up rates from parameter files + eitc_rates_by_children = load_take_up_rate("eitc", self.time_period) + dc_ptc_rate = load_take_up_rate("dc_ptc", self.time_period) + snap_rate = load_take_up_rate("snap", self.time_period) + aca_rate = load_take_up_rate("aca", self.time_period) + medicaid_rate = load_take_up_rate("medicaid", self.time_period) + + # EITC: varies by number of children eitc_child_count = baseline.calculate("eitc_child_count").values - eitc_takeup_rate = eitc_takeup_rates.calc(eitc_child_count) + eitc_takeup_rate = np.array( + [eitc_rates_by_children.get(min(int(c), 3), 0.85) for c in eitc_child_count] + ) data["takes_up_eitc"] = ( generator.random(len(data["tax_unit_id"])) < eitc_takeup_rate ) - dc_ptc_takeup_rate = parameters.gov.states.dc.tax.income.credits.ptc.takeup + + # DC Property Tax Credit data["takes_up_dc_ptc"] = ( - generator.random(len(data["tax_unit_id"])) < dc_ptc_takeup_rate + generator.random(len(data["tax_unit_id"])) < dc_ptc_rate ) - generator = np.random.default_rng(seed=100) - data["snap_take_up_seed"] = generator.random(len(data["spm_unit_id"])) - data["aca_take_up_seed"] = generator.random(len(data["tax_unit_id"])) - data["medicaid_take_up_seed"] = generator.random(len(data["person_id"])) + # SNAP + data["takes_up_snap_if_eligible"] = ( + generator.random(len(data["spm_unit_id"])) < snap_rate + ) + + # ACA + data["takes_up_aca_if_eligible"] = ( + generator.random(len(data["tax_unit_id"])) < aca_rate + ) + + # Medicaid + data["takes_up_medicaid_if_eligible"] = ( + generator.random(len(data["person_id"])) < medicaid_rate + ) self.save_dataset(data) diff --git 
a/policyengine_us_data/parameters/__init__.py b/policyengine_us_data/parameters/__init__.py new file mode 100644 index 00000000..a8acad19 --- /dev/null +++ b/policyengine_us_data/parameters/__init__.py @@ -0,0 +1,53 @@ +""" +Take-up rate parameters for stochastic simulation. + +These parameters are stored in the data package to keep the country package +as a purely deterministic rules engine. +""" +import yaml +from pathlib import Path + +PARAMETERS_DIR = Path(__file__).parent + + +def load_take_up_rate(variable_name: str, year: int = 2018) -> float: + """Load take-up rate from YAML parameter files. + + Args: + variable_name: Name of the take-up parameter file (without .yaml) + year: Year for which to get the rate + + Returns: + Take-up rate as a float between 0 and 1 + """ + yaml_path = PARAMETERS_DIR / "take_up" / f"{variable_name}.yaml" + + with open(yaml_path) as f: + data = yaml.safe_load(f) + + # Handle EITC special case (has rates_by_children instead of values) + if "rates_by_children" in data: + return data["rates_by_children"] # Return the dict + + # Find the applicable value for the year + values = data["values"] + applicable_value = None + + for date_key, value in sorted(values.items()): + # Handle both string and datetime.date objects from YAML + if hasattr(date_key, "year"): + # It's a datetime.date object + date_year = date_key.year + else: + # It's a string + date_year = int(date_key.split("-")[0]) + + if date_year <= year: + applicable_value = value + else: + break + + if applicable_value is None: + raise ValueError(f"No take-up rate found for {variable_name} in {year}") + + return applicable_value diff --git a/policyengine_us_data/parameters/take_up/aca.yaml b/policyengine_us_data/parameters/take_up/aca.yaml new file mode 100644 index 00000000..98f92014 --- /dev/null +++ b/policyengine_us_data/parameters/take_up/aca.yaml @@ -0,0 +1,10 @@ +description: Share of people eligible for Affordable Care Act coverage who enroll.
+metadata: + label: ACA takeup rate + unit: /1 + period: year + reference: + - title: KFF "A Closer Look at the Remaining Uninsured Population Eligible for Medicaid and CHIP" + href: https://www.kff.org/uninsured/issue-brief/a-closer-look-at-the-remaining-uninsured-population-eligible-for-medicaid-and-chip/#:~:text=the%20uninsured%20rate%20dropped%20to,States%20began%20the +values: + 2018-01-01: 0.672 diff --git a/policyengine_us_data/parameters/take_up/dc_ptc.yaml b/policyengine_us_data/parameters/take_up/dc_ptc.yaml new file mode 100644 index 00000000..6195ecf3 --- /dev/null +++ b/policyengine_us_data/parameters/take_up/dc_ptc.yaml @@ -0,0 +1,11 @@ +description: The share of eligible individuals who claim the DC property tax credit. +metadata: + unit: /1 + label: DC property tax credit takeup rate + period: year + reference: + - title: District of Columbia Tax Expenditure Report, 2024 + href: https://ora-cfo.dc.gov/sites/default/files/dc/sites/ora-cfo/publication/attachments/2024%20Tax%20Expenditure%20Report.pdf#page=234 +values: + # 37,133 (from 2024 Tax Expenditure Report) / 131,791,388 (PolicyEngine DC PTC value estimate) + 2021-01-01: 0.32 diff --git a/policyengine_us_data/parameters/take_up/eitc.yaml b/policyengine_us_data/parameters/take_up/eitc.yaml new file mode 100644 index 00000000..17aa9daa --- /dev/null +++ b/policyengine_us_data/parameters/take_up/eitc.yaml @@ -0,0 +1,12 @@ +description: The share of eligible individuals who claim the EITC (by number of children). 
+metadata: + label: EITC take-up rate by number of children + reference: + - title: National Taxpayer Advocate Special Report to Congress 2020 | IRS + href: https://www.taxpayeradvocate.irs.gov/wp-content/uploads/2020/08/JRC20_Volume3.pdf#page=62 +# Maps number of children to take-up rate +rates_by_children: + 0: 0.65 + 1: 0.86 + 2: 0.85 + 3: 0.85 # 3 or more children; assumed same as 2 diff --git a/policyengine_us_data/parameters/take_up/medicaid.yaml b/policyengine_us_data/parameters/take_up/medicaid.yaml new file mode 100644 index 00000000..cfdf919a --- /dev/null +++ b/policyengine_us_data/parameters/take_up/medicaid.yaml @@ -0,0 +1,10 @@ +description: Share of people eligible for Medicaid who enroll. +metadata: + label: Medicaid takeup rate + unit: /1 + period: year + reference: + - title: KFF "A Closer Look at the Remaining Uninsured Population Eligible for Medicaid and CHIP" + href: https://www.kff.org/uninsured/issue-brief/a-closer-look-at-the-remaining-uninsured-population-eligible-for-medicaid-and-chip/#:~:text=the%20uninsured%20rate%20dropped%20to,States%20began%20the +values: + 2018-01-01: 0.93 diff --git a/policyengine_us_data/parameters/take_up/snap.yaml b/policyengine_us_data/parameters/take_up/snap.yaml new file mode 100644 index 00000000..12b6012e --- /dev/null +++ b/policyengine_us_data/parameters/take_up/snap.yaml @@ -0,0 +1,9 @@ +description: Share of those eligible for SNAP who claim it.
+metadata: + label: SNAP takeup rate + unit: /1 + reference: + - title: USDA + href: https://www.fns.usda.gov/usamap +values: + 2018-01-01: 0.82 From 39fca44a96a306f9dcb9a98ee46e4d2a9452a51e Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Mon, 10 Nov 2025 05:54:03 -0600 Subject: [PATCH 2/4] Add Head Start and Early Head Start takeup generation - Create takeup parameter files with rates from NIEER report - Head Start: 40% (pre-pandemic), 30% (pandemic 2020-2021) - Early Head Start: 9% - Generate stochastic takeup in CPS dataset using same pattern as SNAP/Medicaid - Coordinates with policyengine-us PR adding takeup variables --- policyengine_us_data/datasets/cps/cps.py | 12 ++++++++++++ .../parameters/take_up/early_head_start.yaml | 9 +++++++++ .../parameters/take_up/head_start.yaml | 10 ++++++++++ 3 files changed, 31 insertions(+) create mode 100644 policyengine_us_data/parameters/take_up/early_head_start.yaml create mode 100644 policyengine_us_data/parameters/take_up/head_start.yaml diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 6fe0c373..0f5bab60 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -219,6 +219,8 @@ def add_takeup(self): snap_rate = load_take_up_rate("snap", self.time_period) aca_rate = load_take_up_rate("aca", self.time_period) medicaid_rate = load_take_up_rate("medicaid", self.time_period) + head_start_rate = load_take_up_rate("head_start", self.time_period) + early_head_start_rate = load_take_up_rate("early_head_start", self.time_period) # EITC: varies by number of children eitc_child_count = baseline.calculate("eitc_child_count").values @@ -249,6 +251,16 @@ def add_takeup(self): generator.random(len(data["person_id"])) < medicaid_rate ) + # Head Start + data["takes_up_head_start_if_eligible"] = ( + generator.random(len(data["person_id"])) < head_start_rate + ) + + # Early Head Start + data["takes_up_early_head_start_if_eligible"] = ( + 
generator.random(len(data["person_id"])) < early_head_start_rate + ) + self.save_dataset(data) diff --git a/policyengine_us_data/parameters/take_up/early_head_start.yaml b/policyengine_us_data/parameters/take_up/early_head_start.yaml new file mode 100644 index 00000000..3802d988 --- /dev/null +++ b/policyengine_us_data/parameters/take_up/early_head_start.yaml @@ -0,0 +1,9 @@ +description: Share of eligible infants and toddlers who enroll in Early Head Start. +metadata: + label: Early Head Start take-up rate + unit: /1 + reference: + - title: NIEER State(s) of Head Start and Early Head Start Report + href: https://nieer.org/research-library/states-head-start-early-head-start +values: + 2020-09-01: 0.09 diff --git a/policyengine_us_data/parameters/take_up/head_start.yaml b/policyengine_us_data/parameters/take_up/head_start.yaml new file mode 100644 index 00000000..9495f44b --- /dev/null +++ b/policyengine_us_data/parameters/take_up/head_start.yaml @@ -0,0 +1,10 @@ +description: Share of eligible children who enroll in Head Start.
+metadata: + label: Head Start take-up rate + unit: /1 + reference: + - title: NIEER State(s) of Head Start and Early Head Start Report + href: https://nieer.org/research-library/states-head-start-early-head-start +values: + 2020-09-01: 0.40 + 2021-09-01: 0.30 From a502f1a6152abd3ed04d06e588217f0e7abba043 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 3 Dec 2025 17:20:56 -0500 Subject: [PATCH 3/4] Add changelog entry and remove debug file --- changelog_entry.yaml | 4 ++++ household_counts.txt | 12 ------------ policyengine_us_data/datasets/cps/cps.py | 9 +++++++-- policyengine_us_data/parameters/__init__.py | 5 ++++- 4 files changed, 15 insertions(+), 15 deletions(-) delete mode 100644 household_counts.txt diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..1a8cc410 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: minor + changes: + added: + - Move all randomness to data package for deterministic country package. Take-up decisions for SNAP, Medicaid, ACA, EITC, DC PTC, Head Start, and Early Head Start are now generated stochastically during dataset creation using take-up rates from YAML parameter files. 
diff --git a/household_counts.txt b/household_counts.txt deleted file mode 100644 index b336dd0a..00000000 --- a/household_counts.txt +++ /dev/null @@ -1,12 +0,0 @@ -L0 Value | Household Count ----------|---------------- -4.9999e-07 | 7,977 -4.999e-07 | 8,005 -5.0000078125e-07 | 7,784 -5.000015625e-07 | 7,617 -5.00003125e-07 | 8,069 -5.0000625e-07 | 7,908 -5.000125e-07 | 7,937 -5.00025e-07 | 7,889 -5.0005e-07 | 7,935 -5.002e-07 | 7,897 diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 0f5bab60..db637fe4 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -220,12 +220,17 @@ def add_takeup(self): aca_rate = load_take_up_rate("aca", self.time_period) medicaid_rate = load_take_up_rate("medicaid", self.time_period) head_start_rate = load_take_up_rate("head_start", self.time_period) - early_head_start_rate = load_take_up_rate("early_head_start", self.time_period) + early_head_start_rate = load_take_up_rate( + "early_head_start", self.time_period + ) # EITC: varies by number of children eitc_child_count = baseline.calculate("eitc_child_count").values eitc_takeup_rate = np.array( - [eitc_rates_by_children.get(min(int(c), 3), 0.85) for c in eitc_child_count] + [ + eitc_rates_by_children.get(min(int(c), 3), 0.85) + for c in eitc_child_count + ] ) data["takes_up_eitc"] = ( generator.random(len(data["tax_unit_id"])) < eitc_takeup_rate diff --git a/policyengine_us_data/parameters/__init__.py b/policyengine_us_data/parameters/__init__.py index a8acad19..0e5856a6 100644 --- a/policyengine_us_data/parameters/__init__.py +++ b/policyengine_us_data/parameters/__init__.py @@ -4,6 +4,7 @@ These parameters are stored in the data package to keep the country package as a purely deterministic rules engine. 
""" + import yaml from pathlib import Path @@ -48,6 +49,8 @@ def load_take_up_rate(variable_name: str, year: int = 2018) -> float: break if applicable_value is None: - raise ValueError(f"No take-up rate found for {variable_name} in {year}") + raise ValueError( + f"No take-up rate found for {variable_name} in {year}" + ) return applicable_value From 4b70d6435b3b9f6637bf5fdb175ac38231ffea1c Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 3 Dec 2025 18:02:39 -0500 Subject: [PATCH 4/4] Add tests for stochastic variable generation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tests verify: - Take-up rate parameters load correctly (EITC, SNAP, Medicaid, etc.) - Seeded RNG produces deterministic results - Take-up proportions match expected rates 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../tests/test_stochastic_variables.py | 110 ++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 policyengine_us_data/tests/test_stochastic_variables.py diff --git a/policyengine_us_data/tests/test_stochastic_variables.py b/policyengine_us_data/tests/test_stochastic_variables.py new file mode 100644 index 00000000..e4d896e4 --- /dev/null +++ b/policyengine_us_data/tests/test_stochastic_variables.py @@ -0,0 +1,110 @@ +"""Tests for stochastic variable generation in the data package. + +These tests verify that: +1. Take-up rate parameters load correctly +2. Seeded RNG produces deterministic results +3. 
Take-up rates produce plausible proportions +""" + +import pytest +import numpy as np +from policyengine_us_data.parameters import load_take_up_rate + + +class TestTakeUpRateParameters: + """Test that take-up rate parameters load correctly.""" + + def test_eitc_rate_loads(self): + """EITC take-up rates should load and be plausible.""" + rates = load_take_up_rate("eitc", 2022) + # EITC rates are by number of children: 0, 1, 2, 3+ + assert isinstance(rates, dict) or isinstance(rates, float) + if isinstance(rates, dict): + for key, rate in rates.items(): + assert 0 < rate <= 1 + + def test_snap_rate_loads(self): + """SNAP take-up rate should load and be plausible.""" + rate = load_take_up_rate("snap", 2022) + assert 0 < rate <= 1 + + def test_medicaid_rate_loads(self): + """Medicaid take-up rate should load and be plausible.""" + rate = load_take_up_rate("medicaid", 2022) + assert 0 < rate <= 1 + + def test_aca_rate_loads(self): + """ACA take-up rate should load and be plausible.""" + rate = load_take_up_rate("aca", 2022) + assert 0 < rate <= 1 + + def test_head_start_rate_loads(self): + """Head Start take-up rate should load and be plausible.""" + rate = load_take_up_rate("head_start", 2022) + assert 0 < rate <= 1 + + def test_early_head_start_rate_loads(self): + """Early Head Start take-up rate should load and be plausible.""" + rate = load_take_up_rate("early_head_start", 2022) + assert 0 < rate <= 1 + + def test_dc_ptc_rate_loads(self): + """DC PTC take-up rate should load and be plausible.""" + rate = load_take_up_rate("dc_ptc", 2022) + assert 0 < rate <= 1 + + +class TestSeededRandomness: + """Test that stochastic generation is deterministic.""" + + def test_same_seed_produces_same_results(self): + """Using the same seed should produce identical results.""" + seed = 0 + n = 1_000 + + generator1 = np.random.default_rng(seed=seed) + result1 = generator1.random(n) + + generator2 = np.random.default_rng(seed=seed) + result2 = generator2.random(n) + + 
np.testing.assert_array_equal(result1, result2) + + def test_different_seeds_produce_different_results(self): + """Different seeds should produce different results.""" + n = 1_000 + + generator1 = np.random.default_rng(seed=0) + result1 = generator1.random(n) + + generator2 = np.random.default_rng(seed=1) + result2 = generator2.random(n) + + assert not np.array_equal(result1, result2) + + +class TestTakeUpProportions: + """Test that take-up rates produce plausible proportions.""" + + def test_take_up_produces_expected_proportion(self): + """Simulated take-up should match the rate approximately.""" + rate = 0.7 + n = 10_000 + generator = np.random.default_rng(seed=42) + + take_up = generator.random(n) < rate + actual_proportion = take_up.mean() + + # Should be within 5 percentage points of the rate + assert abs(actual_proportion - rate) < 0.05 + + def test_boolean_generation(self): + """Take-up decisions should be boolean.""" + rate = 0.5 + n = 100 + generator = np.random.default_rng(seed=42) + + take_up = generator.random(n) < rate + + assert take_up.dtype == bool + assert set(take_up).issubset({True, False})