Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
- bump: minor
changes:
added:
- Move all randomness to data package for deterministic country package. Take-up decisions for SNAP, Medicaid, ACA, EITC, DC PTC, Head Start, and Early Head Start are now generated stochastically during dataset creation using take-up rates from YAML parameter files.
57 changes: 48 additions & 9 deletions policyengine_us_data/datasets/cps/cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
)
from microimpute.models.qrf import QRF
import logging
from policyengine_us_data.parameters import load_take_up_rate


test_lite = os.environ.get("TEST_LITE") == "true"
Expand Down Expand Up @@ -207,25 +208,63 @@ def add_takeup(self):
from policyengine_us import system, Microsimulation

baseline = Microsimulation(dataset=self)
parameters = baseline.tax_benefit_system.parameters(self.time_period)

# Generate all stochastic take-up decisions using take-up rates from parameter files
# This keeps the country package purely deterministic
generator = np.random.default_rng(seed=100)

eitc_takeup_rates = parameters.gov.irs.credits.eitc.takeup
# Load take-up rates from parameter files
eitc_rates_by_children = load_take_up_rate("eitc", self.time_period)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The eitc YAML has no values with dates, so what happens if someone adds time-varying EITC rates? Claude thinks it would silently ignore the date:

  If someone updated eitc.yaml to:

  rates_by_children:
    0: 0.65
    1: 0.86
    2: 0.85
    3: 0.85
  values:
    2018-01-01: 0.80
    2025-01-01: 0.90

  The function would still hit this first and return early:

  if "rates_by_children" in data:
      return data["rates_by_children"]  # never reaches the values lookup

  So the dated values would be ignored entirely. 

dc_ptc_rate = load_take_up_rate("dc_ptc", self.time_period)
snap_rate = load_take_up_rate("snap", self.time_period)
aca_rate = load_take_up_rate("aca", self.time_period)
medicaid_rate = load_take_up_rate("medicaid", self.time_period)
head_start_rate = load_take_up_rate("head_start", self.time_period)
early_head_start_rate = load_take_up_rate(
"early_head_start", self.time_period
)

# EITC: varies by number of children
eitc_child_count = baseline.calculate("eitc_child_count").values
eitc_takeup_rate = eitc_takeup_rates.calc(eitc_child_count)
eitc_takeup_rate = np.array(
[
eitc_rates_by_children.get(min(int(c), 3), 0.85)
for c in eitc_child_count
]
)
data["takes_up_eitc"] = (
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Each call to generator.random() advances the RNG state. So the order of these calls matters. If someone reorders these lines, or inserts a new program in the middle, all downstream assignments get different random draws.

generator.random(len(data["tax_unit_id"])) < eitc_takeup_rate
)
dc_ptc_takeup_rate = parameters.gov.states.dc.tax.income.credits.ptc.takeup

# DC Property Tax Credit
data["takes_up_dc_ptc"] = (
generator.random(len(data["tax_unit_id"])) < dc_ptc_takeup_rate
generator.random(len(data["tax_unit_id"])) < dc_ptc_rate
)

# SNAP
data["takes_up_snap_if_eligible"] = (
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just noting that all these are independent. Probably in the real world, takeups are correlated, but I understand that may be beyond the scope of the engine in 2025.

OTOH, we could think about what the latent dimensions are and then create "loadings" from, say 3 random, even correlated values to the different take up seeds. Probably not worth it now, and we could always extend to that later, but just throwing it out there.

generator.random(len(data["spm_unit_id"])) < snap_rate
)

# ACA
data["takes_up_aca_if_eligible"] = (
generator.random(len(data["tax_unit_id"])) < aca_rate
)

# Medicaid
data["takes_up_medicaid_if_eligible"] = (
generator.random(len(data["person_id"])) < medicaid_rate
)

# Head Start
data["takes_up_head_start_if_eligible"] = (
generator.random(len(data["person_id"])) < head_start_rate
)
generator = np.random.default_rng(seed=100)

data["snap_take_up_seed"] = generator.random(len(data["spm_unit_id"]))
data["aca_take_up_seed"] = generator.random(len(data["tax_unit_id"]))
data["medicaid_take_up_seed"] = generator.random(len(data["person_id"]))
# Early Head Start
data["takes_up_early_head_start_if_eligible"] = (
generator.random(len(data["person_id"])) < early_head_start_rate
)

self.save_dataset(data)

Expand Down
56 changes: 56 additions & 0 deletions policyengine_us_data/parameters/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""
Take-up rate parameters for stochastic simulation.
These parameters are stored in the data package to keep the country package
as a purely deterministic rules engine.
"""

import yaml
from pathlib import Path

PARAMETERS_DIR = Path(__file__).parent


def load_take_up_rate(variable_name: str, year: int = 2018) -> float:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why duplicate rather than import? The data package already depends on policyengine-us. Why not just read the parameters from there?

"""Load take-up rate from YAML parameter files.
Args:
variable_name: Name of the take-up parameter file (without .yaml)
year: Year for which to get the rate
Returns:
Take-up rate as a float between 0 and 1
"""
yaml_path = PARAMETERS_DIR / "take_up" / f"{variable_name}.yaml"

with open(yaml_path) as f:
data = yaml.safe_load(f)

# Handle EITC special case (has rates_by_children instead of values)
if "rates_by_children" in data:
return data["rates_by_children"] # Return the dict

# Find the applicable value for the year
values = data["values"]
applicable_value = None

for date_key, value in sorted(values.items()):
# Handle both string and datetime.date objects from YAML
if hasattr(date_key, "year"):
# It's a datetime.date object
date_year = date_key.year
else:
# It's a string
date_year = int(date_key.split("-")[0])

if date_year <= year:
applicable_value = value
else:
break

if applicable_value is None:
raise ValueError(
f"No take-up rate found for {variable_name} in {year}"
)

return applicable_value
10 changes: 10 additions & 0 deletions policyengine_us_data/parameters/take_up/aca.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
description: Percentage of eligible people who do enroll in Affordable Care Act coverage, if eligible.
metadata:
label: ACA takeup rate
unit: /1
period: year
reference:
- title: KFF "A Closer Look at the Remaining Uninsured Population Eligible for Medicaid and CHIP"
href: https://www.kff.org/uninsured/issue-brief/a-closer-look-at-the-remaining-uninsured-population-eligible-for-medicaid-and-chip/#:~:text=the%20uninsured%20rate%20dropped%20to,States%20began%20the
values:
2018-01-01: 0.672
11 changes: 11 additions & 0 deletions policyengine_us_data/parameters/take_up/dc_ptc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
description: The share of eligible individuals who claim the DC property tax credit.
metadata:
unit: /1
label: DC property tax credit takeup rate
period: year
reference:
- title: District of Columbia Tax Expenditure Report, 2024
href: https://ora-cfo.dc.gov/sites/default/files/dc/sites/ora-cfo/publication/attachments/2024%20Tax%20Expenditure%20Report.pdf#page=234
values:
# 37,133 (from 2024 Tax Expenditure Report) / 131,791,388 (PolicyEngine DC PTC value estimate)
2021-01-01: 0.32
9 changes: 9 additions & 0 deletions policyengine_us_data/parameters/take_up/early_head_start.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
description: Percentage of eligible infants and toddlers who enroll in Early Head Start.
metadata:
label: Early Head Start take-up rate
unit: /1
reference:
- title: NIEER State(s) of Head Start and Early Head Start Report
href: https://nieer.org/research-library/states-head-start-early-head-start
values:
2020-09-01: 0.09
12 changes: 12 additions & 0 deletions policyengine_us_data/parameters/take_up/eitc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
description: The share of eligible individuals who claim the EITC (by number of children).
metadata:
label: EITC take-up rate by number of children
reference:
- title: National Taxpayer Advocate Special Report to Congress 2020 | IRS
href: https://www.taxpayeradvocate.irs.gov/wp-content/uploads/2020/08/JRC20_Volume3.pdf#page=62
# Maps number of children to take-up rate
rates_by_children:
0: 0.65
1: 0.86
2: 0.85
3: 0.85 # Assume same as 2
10 changes: 10 additions & 0 deletions policyengine_us_data/parameters/take_up/head_start.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
description: Percentage of eligible children who enroll in Head Start.
metadata:
label: Head Start take-up rate
unit: /1
reference:
- title: NIEER State(s) of Head Start and Early Head Start Report
href: https://nieer.org/research-library/states-head-start-early-head-start
values:
2020-09-01: 0.40
2021-09-01: 0.30
10 changes: 10 additions & 0 deletions policyengine_us_data/parameters/take_up/medicaid.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
description: Percentage of people who do enroll in Medicaid, if eligible.
metadata:
label: Medicaid takeup rate
unit: /1
period: year
reference:
- title: KFF "A Closer Look at the Remaining Uninsured Population Eligible for Medicaid and CHIP"
href: https://www.kff.org/uninsured/issue-brief/a-closer-look-at-the-remaining-uninsured-population-eligible-for-medicaid-and-chip/#:~:text=the%20uninsured%20rate%20dropped%20to,States%20began%20the
values:
2018-01-01: 0.93
9 changes: 9 additions & 0 deletions policyengine_us_data/parameters/take_up/snap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
description: Percentage of eligible SNAP recipients who claim SNAP.
metadata:
label: SNAP takeup rate
unit: /1
reference:
- title: USDA
href: https://www.fns.usda.gov/usamap
values:
2018-01-01: 0.82
110 changes: 110 additions & 0 deletions policyengine_us_data/tests/test_stochastic_variables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
"""Tests for stochastic variable generation in the data package.

These tests verify that:
1. Take-up rate parameters load correctly
2. Seeded RNG produces deterministic results
3. Take-up rates produce plausible proportions
"""

import pytest
import numpy as np
from policyengine_us_data.parameters import load_take_up_rate


class TestTakeUpRateParameters:
"""Test that take-up rate parameters load correctly."""

def test_eitc_rate_loads(self):
"""EITC take-up rates should load and be plausible."""
rates = load_take_up_rate("eitc", 2022)
# EITC rates are by number of children: 0, 1, 2, 3+
assert isinstance(rates, dict) or isinstance(rates, float)
if isinstance(rates, dict):
for key, rate in rates.items():
assert 0 < rate <= 1

def test_snap_rate_loads(self):
"""SNAP take-up rate should load and be plausible."""
rate = load_take_up_rate("snap", 2022)
assert 0 < rate <= 1

def test_medicaid_rate_loads(self):
"""Medicaid take-up rate should load and be plausible."""
rate = load_take_up_rate("medicaid", 2022)
assert 0 < rate <= 1

def test_aca_rate_loads(self):
"""ACA take-up rate should load and be plausible."""
rate = load_take_up_rate("aca", 2022)
assert 0 < rate <= 1

def test_head_start_rate_loads(self):
"""Head Start take-up rate should load and be plausible."""
rate = load_take_up_rate("head_start", 2022)
assert 0 < rate <= 1

def test_early_head_start_rate_loads(self):
"""Early Head Start take-up rate should load and be plausible."""
rate = load_take_up_rate("early_head_start", 2022)
assert 0 < rate <= 1

def test_dc_ptc_rate_loads(self):
"""DC PTC take-up rate should load and be plausible."""
rate = load_take_up_rate("dc_ptc", 2022)
assert 0 < rate <= 1


class TestSeededRandomness:
"""Test that stochastic generation is deterministic."""

def test_same_seed_produces_same_results(self):
"""Using the same seed should produce identical results."""
seed = 0
n = 1_000

generator1 = np.random.default_rng(seed=seed)
result1 = generator1.random(n)

generator2 = np.random.default_rng(seed=seed)
result2 = generator2.random(n)

np.testing.assert_array_equal(result1, result2)

def test_different_seeds_produce_different_results(self):
"""Different seeds should produce different results."""
n = 1_000

generator1 = np.random.default_rng(seed=0)
result1 = generator1.random(n)

generator2 = np.random.default_rng(seed=1)
result2 = generator2.random(n)

assert not np.array_equal(result1, result2)


class TestTakeUpProportions:
"""Test that take-up rates produce plausible proportions."""

def test_take_up_produces_expected_proportion(self):
"""Simulated take-up should match the rate approximately."""
rate = 0.7
n = 10_000
generator = np.random.default_rng(seed=42)

take_up = generator.random(n) < rate
actual_proportion = take_up.mean()

# Should be within 5 percentage points of the rate
assert abs(actual_proportion - rate) < 0.05

def test_boolean_generation(self):
"""Take-up decisions should be boolean."""
rate = 0.5
n = 100
generator = np.random.default_rng(seed=42)

take_up = generator.random(n) < rate

assert take_up.dtype == bool
assert set(take_up).issubset({True, False})