diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..1a8cc410 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: minor + changes: + added: + - Move all randomness to data package for deterministic country package. Take-up decisions for SNAP, Medicaid, ACA, EITC, DC PTC, Head Start, and Early Head Start are now generated stochastically during dataset creation using take-up rates from YAML parameter files. diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index f932e0d5..db637fe4 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -14,6 +14,7 @@ ) from microimpute.models.qrf import QRF import logging +from policyengine_us_data.parameters import load_take_up_rate test_lite = os.environ.get("TEST_LITE") == "true" @@ -207,25 +208,63 @@ def add_takeup(self): from policyengine_us import system, Microsimulation baseline = Microsimulation(dataset=self) - parameters = baseline.tax_benefit_system.parameters(self.time_period) + # Generate all stochastic take-up decisions using take-up rates from parameter files + # This keeps the country package purely deterministic generator = np.random.default_rng(seed=100) - eitc_takeup_rates = parameters.gov.irs.credits.eitc.takeup + # Load take-up rates from parameter files + eitc_rates_by_children = load_take_up_rate("eitc", self.time_period) + dc_ptc_rate = load_take_up_rate("dc_ptc", self.time_period) + snap_rate = load_take_up_rate("snap", self.time_period) + aca_rate = load_take_up_rate("aca", self.time_period) + medicaid_rate = load_take_up_rate("medicaid", self.time_period) + head_start_rate = load_take_up_rate("head_start", self.time_period) + early_head_start_rate = load_take_up_rate( + "early_head_start", self.time_period + ) + + # EITC: varies by number of children eitc_child_count = baseline.calculate("eitc_child_count").values - eitc_takeup_rate = 
eitc_takeup_rates.calc(eitc_child_count) + eitc_takeup_rate = np.array( + [ + eitc_rates_by_children.get(min(int(c), 3), 0.85) + for c in eitc_child_count + ] + ) data["takes_up_eitc"] = ( generator.random(len(data["tax_unit_id"])) < eitc_takeup_rate ) - dc_ptc_takeup_rate = parameters.gov.states.dc.tax.income.credits.ptc.takeup + + # DC Property Tax Credit data["takes_up_dc_ptc"] = ( - generator.random(len(data["tax_unit_id"])) < dc_ptc_takeup_rate + generator.random(len(data["tax_unit_id"])) < dc_ptc_rate + ) + + # SNAP + data["takes_up_snap_if_eligible"] = ( + generator.random(len(data["spm_unit_id"])) < snap_rate + ) + + # ACA + data["takes_up_aca_if_eligible"] = ( + generator.random(len(data["tax_unit_id"])) < aca_rate + ) + + # Medicaid + data["takes_up_medicaid_if_eligible"] = ( + generator.random(len(data["person_id"])) < medicaid_rate + ) + + # Head Start + data["takes_up_head_start_if_eligible"] = ( + generator.random(len(data["person_id"])) < head_start_rate ) - generator = np.random.default_rng(seed=100) - data["snap_take_up_seed"] = generator.random(len(data["spm_unit_id"])) - data["aca_take_up_seed"] = generator.random(len(data["tax_unit_id"])) - data["medicaid_take_up_seed"] = generator.random(len(data["person_id"])) + # Early Head Start + data["takes_up_early_head_start_if_eligible"] = ( + generator.random(len(data["person_id"])) < early_head_start_rate + ) self.save_dataset(data) diff --git a/policyengine_us_data/parameters/__init__.py b/policyengine_us_data/parameters/__init__.py new file mode 100644 index 00000000..0e5856a6 --- /dev/null +++ b/policyengine_us_data/parameters/__init__.py @@ -0,0 +1,56 @@ +""" +Take-up rate parameters for stochastic simulation. + +These parameters are stored in the data package to keep the country package +as a purely deterministic rules engine. 
+"""
+
+import yaml
+from pathlib import Path
+from typing import Dict, Union
+
+PARAMETERS_DIR = Path(__file__).parent
+
+
+def load_take_up_rate(
+    variable_name: str, year: int = 2018
+) -> Union[float, Dict[int, float]]:
+    """Load a take-up rate from the YAML parameter files.
+
+    Args:
+        variable_name: Name of the take-up parameter file (without .yaml).
+        year: Year for which to get the rate. Only the year of each start
+            date is compared, so an entry that starts mid-year applies to
+            that whole calendar year.
+
+    Returns:
+        For a file with a ``values`` mapping, the value of the latest
+        entry whose start year is not after ``year``. For a file with a
+        ``rates_by_children`` mapping (the EITC file), that mapping
+        itself, regardless of ``year``.
+
+    Raises:
+        ValueError: If no entry in ``values`` starts in or before ``year``.
+            The default year (2018) precedes the only entry of
+            dc_ptc.yaml (2021), so pass ``year`` explicitly.
+    """
+    yaml_path = PARAMETERS_DIR / "take_up" / f"{variable_name}.yaml"
+
+    # Explicit encoding: do not depend on the platform default.
+    with open(yaml_path, encoding="utf-8") as f:
+        data = yaml.safe_load(f)
+
+    # EITC special case: rates are keyed by child count, not by date.
+    if "rates_by_children" in data:
+        return data["rates_by_children"]
+
+    # Accept a year given as text (e.g. "2022") as well as an int.
+    year = int(year)
+    applicable_value = None
+
+    # Sort on the ISO text of each key so that datetime.date keys and
+    # string keys (YAML can produce either) order chronologically
+    # together instead of raising TypeError when compared to each other.
+    for date_key, value in sorted(
+        data["values"].items(), key=lambda item: str(item[0])
+    ):
+        if hasattr(date_key, "year"):
+            # It's a datetime.date object
+            date_year = date_key.year
+        else:
+            # It's a string such as "2018-01-01"
+            date_year = int(str(date_key).split("-")[0])
+
+        # Entries are in chronological order: stop at the first one
+        # that starts after the requested year.
+        if date_year > year:
+            break
+        applicable_value = value
+
+    if applicable_value is None:
+        raise ValueError(
+            f"No take-up rate found for {variable_name} in {year}"
+        )
+
+    return applicable_value
diff --git a/policyengine_us_data/parameters/take_up/aca.yaml b/policyengine_us_data/parameters/take_up/aca.yaml
new file mode 100644
index 00000000..98f92014
--- /dev/null
+++ b/policyengine_us_data/parameters/take_up/aca.yaml
@@ -0,0 +1,10 @@
+description: Percentage of eligible people who do enroll in Affordable Care Act coverage, if eligible.
+metadata: + label: ACA takeup rate + unit: /1 + period: year + reference: + - title: KFF "A Closer Look at the Remaining Uninsured Population Eligible for Medicaid and CHIP" + href: https://www.kff.org/uninsured/issue-brief/a-closer-look-at-the-remaining-uninsured-population-eligible-for-medicaid-and-chip/#:~:text=the%20uninsured%20rate%20dropped%20to,States%20began%20the +values: + 2018-01-01: 0.672 diff --git a/policyengine_us_data/parameters/take_up/dc_ptc.yaml b/policyengine_us_data/parameters/take_up/dc_ptc.yaml new file mode 100644 index 00000000..6195ecf3 --- /dev/null +++ b/policyengine_us_data/parameters/take_up/dc_ptc.yaml @@ -0,0 +1,11 @@ +description: The share of eligible individuals who claim the DC property tax credit. +metadata: + unit: /1 + label: DC property tax credit takeup rate + period: year + reference: + - title: District of Columbia Tax Expenditure Report, 2024 + href: https://ora-cfo.dc.gov/sites/default/files/dc/sites/ora-cfo/publication/attachments/2024%20Tax%20Expenditure%20Report.pdf#page=234 +values: + # 37,133 (from 2024 Tax Expenditure Report) / 131,791,388 (PolicyEngine DC PTC value estimate) + 2021-01-01: 0.32 diff --git a/policyengine_us_data/parameters/take_up/early_head_start.yaml b/policyengine_us_data/parameters/take_up/early_head_start.yaml new file mode 100644 index 00000000..3802d988 --- /dev/null +++ b/policyengine_us_data/parameters/take_up/early_head_start.yaml @@ -0,0 +1,9 @@ +description: Percentage of eligible infants and toddlers who enroll in Early Head Start. 
+metadata: + label: Early Head Start take-up rate + unit: /1 + reference: + - title: NIEER State(s) of Head Start and Early Head Start Report + href: https://nieer.org/research-library/states-head-start-early-head-start +values: + 2020-09-01: 0.09 diff --git a/policyengine_us_data/parameters/take_up/eitc.yaml b/policyengine_us_data/parameters/take_up/eitc.yaml new file mode 100644 index 00000000..17aa9daa --- /dev/null +++ b/policyengine_us_data/parameters/take_up/eitc.yaml @@ -0,0 +1,12 @@ +description: The share of eligible individuals who claim the EITC (by number of children). +metadata: + label: EITC take-up rate by number of children + reference: + - title: National Taxpayer Advocate Special Report to Congress 2020 | IRS + href: https://www.taxpayeradvocate.irs.gov/wp-content/uploads/2020/08/JRC20_Volume3.pdf#page=62 +# Maps number of children to take-up rate +rates_by_children: + 0: 0.65 + 1: 0.86 + 2: 0.85 + 3: 0.85 # Assume same as 2 diff --git a/policyengine_us_data/parameters/take_up/head_start.yaml b/policyengine_us_data/parameters/take_up/head_start.yaml new file mode 100644 index 00000000..9495f44b --- /dev/null +++ b/policyengine_us_data/parameters/take_up/head_start.yaml @@ -0,0 +1,10 @@ +description: Percentage of eligible children who enroll in Head Start. +metadata: + label: Head Start take-up rate + unit: /1 + reference: + - title: NIEER State(s) of Head Start and Early Head Start Report + href: https://nieer.org/research-library/states-head-start-early-head-start +values: + 2020-09-01: 0.40 + 2021-09-01: 0.30 diff --git a/policyengine_us_data/parameters/take_up/medicaid.yaml b/policyengine_us_data/parameters/take_up/medicaid.yaml new file mode 100644 index 00000000..cfdf919a --- /dev/null +++ b/policyengine_us_data/parameters/take_up/medicaid.yaml @@ -0,0 +1,10 @@ +description: Percentage of people who do enroll in Medicaid, if eligible. 
+metadata: + label: Medicaid takeup rate + unit: /1 + period: year + reference: + - title: KFF "A Closer Look at the Remaining Uninsured Population Eligible for Medicaid and CHIP" + href: https://www.kff.org/uninsured/issue-brief/a-closer-look-at-the-remaining-uninsured-population-eligible-for-medicaid-and-chip/#:~:text=the%20uninsured%20rate%20dropped%20to,States%20began%20the +values: + 2018-01-01: 0.93 diff --git a/policyengine_us_data/parameters/take_up/snap.yaml b/policyengine_us_data/parameters/take_up/snap.yaml new file mode 100644 index 00000000..12b6012e --- /dev/null +++ b/policyengine_us_data/parameters/take_up/snap.yaml @@ -0,0 +1,9 @@ +description: Percentage of eligible SNAP recipients who claim SNAP. +metadata: + label: SNAP takeup rate + unit: /1 + reference: + - title: USDA + href: https://www.fns.usda.gov/usamap +values: + 2018-01-01: 0.82 diff --git a/policyengine_us_data/tests/test_stochastic_variables.py b/policyengine_us_data/tests/test_stochastic_variables.py new file mode 100644 index 00000000..e4d896e4 --- /dev/null +++ b/policyengine_us_data/tests/test_stochastic_variables.py @@ -0,0 +1,110 @@ +"""Tests for stochastic variable generation in the data package. + +These tests verify that: +1. Take-up rate parameters load correctly +2. Seeded RNG produces deterministic results +3. 
Take-up rates produce plausible proportions
+"""
+
+import pytest
+import numpy as np
+from policyengine_us_data.parameters import load_take_up_rate
+
+
+class TestTakeUpRateParameters:
+    """Test that take-up rate parameters load correctly."""
+
+    def test_eitc_rate_loads(self):
+        """EITC take-up rates should load and be plausible."""
+        rates = load_take_up_rate("eitc", 2022)
+        # The EITC file maps number of children (0, 1, 2, 3+) to a rate,
+        # so the loader returns the whole mapping rather than one float.
+        # Requiring the mapping keeps the range check below from being
+        # skipped.
+        assert isinstance(rates, dict)
+        assert rates, "EITC take-up mapping should not be empty"
+        for rate in rates.values():
+            assert 0 < rate <= 1
+
+    def test_snap_rate_loads(self):
+        """SNAP take-up rate should load and be plausible."""
+        rate = load_take_up_rate("snap", 2022)
+        assert 0 < rate <= 1
+
+    def test_medicaid_rate_loads(self):
+        """Medicaid take-up rate should load and be plausible."""
+        rate = load_take_up_rate("medicaid", 2022)
+        assert 0 < rate <= 1
+
+    def test_aca_rate_loads(self):
+        """ACA take-up rate should load and be plausible."""
+        rate = load_take_up_rate("aca", 2022)
+        assert 0 < rate <= 1
+
+    def test_head_start_rate_loads(self):
+        """Head Start take-up rate should load and be plausible."""
+        rate = load_take_up_rate("head_start", 2022)
+        assert 0 < rate <= 1
+
+    def test_early_head_start_rate_loads(self):
+        """Early Head Start take-up rate should load and be plausible."""
+        rate = load_take_up_rate("early_head_start", 2022)
+        assert 0 < rate <= 1
+
+    def test_dc_ptc_rate_loads(self):
+        """DC PTC take-up rate should load and be plausible."""
+        rate = load_take_up_rate("dc_ptc", 2022)
+        assert 0 < rate <= 1
+
+
+class TestSeededRandomness:
+    """Test that stochastic generation is deterministic."""
+
+    def test_same_seed_produces_same_results(self):
+        """Using the same seed should produce identical results."""
+        seed = 0
+        n = 1_000
+
+        generator1 = np.random.default_rng(seed=seed)
+        result1 = generator1.random(n)
+
+        generator2 = np.random.default_rng(seed=seed)
+        result2 = generator2.random(n)
+
+        np.testing.assert_array_equal(result1, result2)
+
+    def test_different_seeds_produce_different_results(self):
+        """Different seeds should produce different results."""
+        n = 1_000
+
+        generator1 = np.random.default_rng(seed=0)
+        result1 = generator1.random(n)
+
+        generator2 = np.random.default_rng(seed=1)
+        result2 = generator2.random(n)
+
+        assert not np.array_equal(result1, result2)
+
+
+class TestTakeUpProportions:
+    """Test that take-up rates produce plausible proportions."""
+
+    def test_take_up_produces_expected_proportion(self):
+        """Simulated take-up should match the rate approximately."""
+        rate = 0.7
+        n = 10_000
+        generator = np.random.default_rng(seed=42)
+
+        take_up = generator.random(n) < rate
+        actual_proportion = take_up.mean()
+
+        # Should be within 5 percentage points of the rate
+        assert abs(actual_proportion - rate) < 0.05
+
+    def test_boolean_generation(self):
+        """Take-up decisions should be boolean."""
+        rate = 0.5
+        n = 100
+        generator = np.random.default_rng(seed=42)
+
+        take_up = generator.random(n) < rate
+
+        assert take_up.dtype == bool
+        assert set(take_up).issubset({True, False})