-
Notifications
You must be signed in to change notification settings - Fork 10
Move all randomness to data package for deterministic country package #451
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
027a273
39fca44
a502f1a
4b70d64
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,4 @@ | ||
| - bump: minor | ||
| changes: | ||
| added: | ||
| - Move all randomness to data package for deterministic country package. Take-up decisions for SNAP, Medicaid, ACA, EITC, DC PTC, Head Start, and Early Head Start are now generated stochastically during dataset creation using take-up rates from YAML parameter files. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,6 +14,7 @@ | |
| ) | ||
| from microimpute.models.qrf import QRF | ||
| import logging | ||
| from policyengine_us_data.parameters import load_take_up_rate | ||
|
|
||
|
|
||
| test_lite = os.environ.get("TEST_LITE") == "true" | ||
|
|
@@ -207,25 +208,63 @@ def add_takeup(self): | |
| from policyengine_us import system, Microsimulation | ||
|
|
||
| baseline = Microsimulation(dataset=self) | ||
| parameters = baseline.tax_benefit_system.parameters(self.time_period) | ||
|
|
||
| # Generate all stochastic take-up decisions using take-up rates from parameter files | ||
| # This keeps the country package purely deterministic | ||
| generator = np.random.default_rng(seed=100) | ||
|
|
||
| eitc_takeup_rates = parameters.gov.irs.credits.eitc.takeup | ||
| # Load take-up rates from parameter files | ||
| eitc_rates_by_children = load_take_up_rate("eitc", self.time_period) | ||
| dc_ptc_rate = load_take_up_rate("dc_ptc", self.time_period) | ||
| snap_rate = load_take_up_rate("snap", self.time_period) | ||
| aca_rate = load_take_up_rate("aca", self.time_period) | ||
| medicaid_rate = load_take_up_rate("medicaid", self.time_period) | ||
| head_start_rate = load_take_up_rate("head_start", self.time_period) | ||
| early_head_start_rate = load_take_up_rate( | ||
| "early_head_start", self.time_period | ||
| ) | ||
|
|
||
| # EITC: varies by number of children | ||
| eitc_child_count = baseline.calculate("eitc_child_count").values | ||
| eitc_takeup_rate = eitc_takeup_rates.calc(eitc_child_count) | ||
| eitc_takeup_rate = np.array( | ||
| [ | ||
| eitc_rates_by_children.get(min(int(c), 3), 0.85) | ||
| for c in eitc_child_count | ||
| ] | ||
| ) | ||
| data["takes_up_eitc"] = ( | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Each call to generator.random() advances the RNG state. So the order of these calls matters. If someone reorders these lines, or inserts a new program in the middle, all downstream assignments get different random draws. |
||
| generator.random(len(data["tax_unit_id"])) < eitc_takeup_rate | ||
| ) | ||
| dc_ptc_takeup_rate = parameters.gov.states.dc.tax.income.credits.ptc.takeup | ||
|
|
||
| # DC Property Tax Credit | ||
| data["takes_up_dc_ptc"] = ( | ||
| generator.random(len(data["tax_unit_id"])) < dc_ptc_takeup_rate | ||
| generator.random(len(data["tax_unit_id"])) < dc_ptc_rate | ||
| ) | ||
|
|
||
| # SNAP | ||
| data["takes_up_snap_if_eligible"] = ( | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just noting that all these are independent. Probably in the real world, takeups are correlated, but I understand that may be beyond the scope of the engine in 2025. OTOH, we could think about what the latent dimensions are and then create "loadings" from, say 3 random, even correlated values to the different take up seeds. Probably not worth it now, and we could always extend to that later, but just throwing it out there. |
||
| generator.random(len(data["spm_unit_id"])) < snap_rate | ||
| ) | ||
|
|
||
| # ACA | ||
| data["takes_up_aca_if_eligible"] = ( | ||
| generator.random(len(data["tax_unit_id"])) < aca_rate | ||
| ) | ||
|
|
||
| # Medicaid | ||
| data["takes_up_medicaid_if_eligible"] = ( | ||
| generator.random(len(data["person_id"])) < medicaid_rate | ||
| ) | ||
|
|
||
| # Head Start | ||
| data["takes_up_head_start_if_eligible"] = ( | ||
| generator.random(len(data["person_id"])) < head_start_rate | ||
| ) | ||
| generator = np.random.default_rng(seed=100) | ||
|
|
||
| data["snap_take_up_seed"] = generator.random(len(data["spm_unit_id"])) | ||
| data["aca_take_up_seed"] = generator.random(len(data["tax_unit_id"])) | ||
| data["medicaid_take_up_seed"] = generator.random(len(data["person_id"])) | ||
| # Early Head Start | ||
| data["takes_up_early_head_start_if_eligible"] = ( | ||
| generator.random(len(data["person_id"])) < early_head_start_rate | ||
| ) | ||
|
|
||
| self.save_dataset(data) | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,56 @@ | ||
| """ | ||
| Take-up rate parameters for stochastic simulation. | ||
| These parameters are stored in the data package to keep the country package | ||
| as a purely deterministic rules engine. | ||
| """ | ||
|
|
||
| import yaml | ||
| from pathlib import Path | ||
|
|
||
| PARAMETERS_DIR = Path(__file__).parent | ||
|
|
||
|
|
||
| def load_take_up_rate(variable_name: str, year: int = 2018) -> float: | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why duplicate rather than import? The data package already depends on policyengine-us. Why not just read the parameters from there? |
||
| """Load take-up rate from YAML parameter files. | ||
| Args: | ||
| variable_name: Name of the take-up parameter file (without .yaml) | ||
| year: Year for which to get the rate | ||
| Returns: | ||
| Take-up rate as a float between 0 and 1 | ||
| """ | ||
| yaml_path = PARAMETERS_DIR / "take_up" / f"{variable_name}.yaml" | ||
|
|
||
| with open(yaml_path) as f: | ||
| data = yaml.safe_load(f) | ||
|
|
||
| # Handle EITC special case (has rates_by_children instead of values) | ||
| if "rates_by_children" in data: | ||
| return data["rates_by_children"] # Return the dict | ||
|
|
||
| # Find the applicable value for the year | ||
| values = data["values"] | ||
| applicable_value = None | ||
|
|
||
| for date_key, value in sorted(values.items()): | ||
| # Handle both string and datetime.date objects from YAML | ||
| if hasattr(date_key, "year"): | ||
| # It's a datetime.date object | ||
| date_year = date_key.year | ||
| else: | ||
| # It's a string | ||
| date_year = int(date_key.split("-")[0]) | ||
|
|
||
| if date_year <= year: | ||
| applicable_value = value | ||
| else: | ||
| break | ||
|
|
||
| if applicable_value is None: | ||
| raise ValueError( | ||
| f"No take-up rate found for {variable_name} in {year}" | ||
| ) | ||
|
|
||
| return applicable_value | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,10 @@ | ||
| description: Percentage of eligible people who do enroll in Affordable Care Act coverage, if eligible. | ||
| metadata: | ||
| label: ACA takeup rate | ||
| unit: /1 | ||
| period: year | ||
| reference: | ||
| - title: KFF "A Closer Look at the Remaining Uninsured Population Eligible for Medicaid and CHIP" | ||
| href: https://www.kff.org/uninsured/issue-brief/a-closer-look-at-the-remaining-uninsured-population-eligible-for-medicaid-and-chip/#:~:text=the%20uninsured%20rate%20dropped%20to,States%20began%20the | ||
| values: | ||
| 2018-01-01: 0.672 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| description: The share of eligible individuals who claim the DC property tax credit. | ||
| metadata: | ||
| unit: /1 | ||
| label: DC property tax credit takeup rate | ||
| period: year | ||
| reference: | ||
| - title: District of Columbia Tax Expenditure Report, 2024 | ||
| href: https://ora-cfo.dc.gov/sites/default/files/dc/sites/ora-cfo/publication/attachments/2024%20Tax%20Expenditure%20Report.pdf#page=234 | ||
| values: | ||
| # 37,133 (from 2024 Tax Expenditure Report) / 131,791,388 (PolicyEngine DC PTC value estimate) | ||
| 2021-01-01: 0.32 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,9 @@ | ||
| description: Percentage of eligible infants and toddlers who enroll in Early Head Start. | ||
| metadata: | ||
| label: Early Head Start take-up rate | ||
| unit: /1 | ||
| reference: | ||
| - title: NIEER State(s) of Head Start and Early Head Start Report | ||
| href: https://nieer.org/research-library/states-head-start-early-head-start | ||
| values: | ||
| 2020-09-01: 0.09 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,12 @@ | ||
| description: The share of eligible individuals who claim the EITC (by number of children). | ||
| metadata: | ||
| label: EITC take-up rate by number of children | ||
| reference: | ||
| - title: National Taxpayer Advocate Special Report to Congress 2020 | IRS | ||
| href: https://www.taxpayeradvocate.irs.gov/wp-content/uploads/2020/08/JRC20_Volume3.pdf#page=62 | ||
| # Maps number of children to take-up rate | ||
| rates_by_children: | ||
| 0: 0.65 | ||
| 1: 0.86 | ||
| 2: 0.85 | ||
| 3: 0.85 # Assume same as 2 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,10 @@ | ||
| description: Percentage of eligible children who enroll in Head Start. | ||
| metadata: | ||
| label: Head Start take-up rate | ||
| unit: /1 | ||
| reference: | ||
| - title: NIEER State(s) of Head Start and Early Head Start Report | ||
| href: https://nieer.org/research-library/states-head-start-early-head-start | ||
| values: | ||
| 2020-09-01: 0.40 | ||
| 2021-09-01: 0.30 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,10 @@ | ||
| description: Percentage of people who do enroll in Medicaid, if eligible. | ||
| metadata: | ||
| label: Medicaid takeup rate | ||
| unit: /1 | ||
| period: year | ||
| reference: | ||
| - title: KFF "A Closer Look at the Remaining Uninsured Population Eligible for Medicaid and CHIP" | ||
| href: https://www.kff.org/uninsured/issue-brief/a-closer-look-at-the-remaining-uninsured-population-eligible-for-medicaid-and-chip/#:~:text=the%20uninsured%20rate%20dropped%20to,States%20began%20the | ||
| values: | ||
| 2018-01-01: 0.93 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,9 @@ | ||
| description: Percentage of eligible SNAP recipients who claim SNAP. | ||
| metadata: | ||
| label: SNAP takeup rate | ||
| unit: /1 | ||
| reference: | ||
| - title: USDA | ||
| href: https://www.fns.usda.gov/usamap | ||
| values: | ||
| 2018-01-01: 0.82 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,110 @@ | ||
| """Tests for stochastic variable generation in the data package. | ||
|
|
||
| These tests verify that: | ||
| 1. Take-up rate parameters load correctly | ||
| 2. Seeded RNG produces deterministic results | ||
| 3. Take-up rates produce plausible proportions | ||
| """ | ||
|
|
||
| import pytest | ||
| import numpy as np | ||
| from policyengine_us_data.parameters import load_take_up_rate | ||
|
|
||
|
|
||
| class TestTakeUpRateParameters: | ||
| """Test that take-up rate parameters load correctly.""" | ||
|
|
||
| def test_eitc_rate_loads(self): | ||
| """EITC take-up rates should load and be plausible.""" | ||
| rates = load_take_up_rate("eitc", 2022) | ||
| # EITC rates are by number of children: 0, 1, 2, 3+ | ||
| assert isinstance(rates, dict) or isinstance(rates, float) | ||
| if isinstance(rates, dict): | ||
| for key, rate in rates.items(): | ||
| assert 0 < rate <= 1 | ||
|
|
||
| def test_snap_rate_loads(self): | ||
| """SNAP take-up rate should load and be plausible.""" | ||
| rate = load_take_up_rate("snap", 2022) | ||
| assert 0 < rate <= 1 | ||
|
|
||
| def test_medicaid_rate_loads(self): | ||
| """Medicaid take-up rate should load and be plausible.""" | ||
| rate = load_take_up_rate("medicaid", 2022) | ||
| assert 0 < rate <= 1 | ||
|
|
||
| def test_aca_rate_loads(self): | ||
| """ACA take-up rate should load and be plausible.""" | ||
| rate = load_take_up_rate("aca", 2022) | ||
| assert 0 < rate <= 1 | ||
|
|
||
| def test_head_start_rate_loads(self): | ||
| """Head Start take-up rate should load and be plausible.""" | ||
| rate = load_take_up_rate("head_start", 2022) | ||
| assert 0 < rate <= 1 | ||
|
|
||
| def test_early_head_start_rate_loads(self): | ||
| """Early Head Start take-up rate should load and be plausible.""" | ||
| rate = load_take_up_rate("early_head_start", 2022) | ||
| assert 0 < rate <= 1 | ||
|
|
||
| def test_dc_ptc_rate_loads(self): | ||
| """DC PTC take-up rate should load and be plausible.""" | ||
| rate = load_take_up_rate("dc_ptc", 2022) | ||
| assert 0 < rate <= 1 | ||
|
|
||
|
|
||
| class TestSeededRandomness: | ||
| """Test that stochastic generation is deterministic.""" | ||
|
|
||
| def test_same_seed_produces_same_results(self): | ||
| """Using the same seed should produce identical results.""" | ||
| seed = 0 | ||
| n = 1_000 | ||
|
|
||
| generator1 = np.random.default_rng(seed=seed) | ||
| result1 = generator1.random(n) | ||
|
|
||
| generator2 = np.random.default_rng(seed=seed) | ||
| result2 = generator2.random(n) | ||
|
|
||
| np.testing.assert_array_equal(result1, result2) | ||
|
|
||
| def test_different_seeds_produce_different_results(self): | ||
| """Different seeds should produce different results.""" | ||
| n = 1_000 | ||
|
|
||
| generator1 = np.random.default_rng(seed=0) | ||
| result1 = generator1.random(n) | ||
|
|
||
| generator2 = np.random.default_rng(seed=1) | ||
| result2 = generator2.random(n) | ||
|
|
||
| assert not np.array_equal(result1, result2) | ||
|
|
||
|
|
||
| class TestTakeUpProportions: | ||
| """Test that take-up rates produce plausible proportions.""" | ||
|
|
||
| def test_take_up_produces_expected_proportion(self): | ||
| """Simulated take-up should match the rate approximately.""" | ||
| rate = 0.7 | ||
| n = 10_000 | ||
| generator = np.random.default_rng(seed=42) | ||
|
|
||
| take_up = generator.random(n) < rate | ||
| actual_proportion = take_up.mean() | ||
|
|
||
| # Should be within 5 percentage points of the rate | ||
| assert abs(actual_proportion - rate) < 0.05 | ||
|
|
||
| def test_boolean_generation(self): | ||
| """Take-up decisions should be boolean.""" | ||
| rate = 0.5 | ||
| n = 100 | ||
| generator = np.random.default_rng(seed=42) | ||
|
|
||
| take_up = generator.random(n) < rate | ||
|
|
||
| assert take_up.dtype == bool | ||
| assert set(take_up).issubset({True, False}) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The eitc YAML has no values with dates, so what happens if someone adds time-varying EITC rates? Claude thinks it would silently ignore the date: