diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..d61b05c8 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: minor + changes: + added: + - Move all randomness to data package for deterministic country package. Take-up decisions are now generated stochastically during dataset creation using take-up rates from YAML parameter files. diff --git a/policyengine_uk_data/datasets/frs.py b/policyengine_uk_data/datasets/frs.py index e9fb53aa..b9970faf 100644 --- a/policyengine_uk_data/datasets/frs.py +++ b/policyengine_uk_data/datasets/frs.py @@ -19,6 +19,7 @@ fill_with_mean, STORAGE_FOLDER, ) +from policyengine_uk_data.parameters import load_take_up_rate, load_parameter def create_frs( @@ -818,48 +819,96 @@ def determine_education_level(fted_val, typeed2_val, age_val): paragraph_3 | paragraph_4 | paragraph_5 ) - # Add random variables which are for now in policyengine-uk. + # Generate stochastic take-up decisions + # All randomness is generated here in the data package using take-up rates + # stored in YAML parameter files. This keeps the country package purely + # deterministic. 
- RANDOM_VARIABLES = [ - "would_evade_tv_licence_fee", - "would_claim_pc", - "would_claim_uc", - "would_claim_child_benefit", - "main_residential_property_purchased_is_first_home", - "household_owns_tv", - "is_higher_earner", - "attends_private_school", - ] + generator = np.random.default_rng(seed=100) - for variable in RANDOM_VARIABLES: - value = sim.calculate(variable).values - entity = sim.tax_benefit_system.variables[variable].entity.key - if entity == "person": - pe_person[variable] = value - elif entity == "household": - pe_household[variable] = value - elif entity == "benunit": - pe_benunit[variable] = value + # Load take-up rates from parameter files + child_benefit_rate = load_take_up_rate("child_benefit", year) + pension_credit_rate = load_take_up_rate("pension_credit", year) + universal_credit_rate = load_take_up_rate("universal_credit", year) + marriage_allowance_rate = load_take_up_rate("marriage_allowance", year) + child_benefit_opts_out_rate = load_take_up_rate( + "child_benefit_opts_out_rate", year + ) + tfc_rate = load_take_up_rate("tax_free_childcare", year) + extended_childcare_rate = load_take_up_rate("extended_childcare", year) + universal_childcare_rate = load_take_up_rate("universal_childcare", year) + targeted_childcare_rate = load_take_up_rate("targeted_childcare", year) - # Add Tax-Free Childcare assumptions + # Generate take-up decisions by comparing random draws to take-up rates + # Person-level + pe_person["would_claim_marriage_allowance"] = ( + generator.random(len(pe_person)) < marriage_allowance_rate + ) - count_benunits = len(pe_benunit) + # Benefit unit-level + pe_benunit["would_claim_child_benefit"] = ( + generator.random(len(pe_benunit)) < child_benefit_rate + ) + pe_benunit["child_benefit_opts_out"] = ( + generator.random(len(pe_benunit)) < child_benefit_opts_out_rate + ) + pe_benunit["would_claim_pc"] = ( + generator.random(len(pe_benunit)) < pension_credit_rate + ) + pe_benunit["would_claim_uc"] = ( + 
generator.random(len(pe_benunit)) < universal_credit_rate + ) + pe_benunit["would_claim_tfc"] = ( + generator.random(len(pe_benunit)) < tfc_rate + ) + pe_benunit["would_claim_extended_childcare"] = ( + generator.random(len(pe_benunit)) < extended_childcare_rate + ) + pe_benunit["would_claim_universal_childcare"] = ( + generator.random(len(pe_benunit)) < universal_childcare_rate + ) + pe_benunit["would_claim_targeted_childcare"] = ( + generator.random(len(pe_benunit)) < targeted_childcare_rate + ) - extended_would_claim = np.random.random(count_benunits) < 0.812 - tfc_would_claim = np.random.random(count_benunits) < 0.586 - universal_would_claim = np.random.random(count_benunits) < 0.563 - targeted_would_claim = np.random.random(count_benunits) < 0.597 + # Generate other stochastic variables using rates from parameter files + tv_ownership_rate = load_parameter("stochastic", "tv_ownership_rate", year) + tv_evasion_rate = load_parameter( + "stochastic", "tv_licence_evasion_rate", year + ) + first_time_buyer_rate = load_parameter( + "stochastic", "first_time_buyer_rate", year + ) + + # Household-level: TV ownership + pe_household["household_owns_tv"] = ( + generator.random(len(pe_household)) < tv_ownership_rate + ) - # Generate extended childcare hours usage values with mean 15.019 and sd 4.972 - extended_hours_values = np.random.normal(15.019, 4.972, count_benunits) + # Household-level: TV licence evasion + pe_household["would_evade_tv_licence_fee"] = ( + generator.random(len(pe_household)) < tv_evasion_rate + ) + + # Household-level: First home purchase + pe_household["main_residential_property_purchased_is_first_home"] = ( + generator.random(len(pe_household)) < first_time_buyer_rate + ) + + # Person-level: Tie-breaking for higher earner (uniform random) + pe_person["higher_earner_tie_break"] = generator.random(len(pe_person)) + + # Person-level: Private school attendance random draw + pe_person["attends_private_school_random_draw"] = generator.random( + 
def load_parameter(
    category: str, variable_name: str, year: int = 2015
) -> float:
    """Load the value of a YAML parameter that applies in a given year.

    The file ``<category>/<variable_name>.yaml`` under ``PARAMETERS_DIR`` must
    contain a ``values`` mapping from start date to value. The entry with the
    latest start date whose calendar year is not after ``year`` is returned.

    Args:
        category: Category subfolder (e.g., 'take_up', 'stochastic')
        variable_name: Name of the parameter file (without .yaml)
        year: Year for which to get the value

    Returns:
        Parameter value as a float

    Raises:
        ValueError: If no entry starts in or before ``year``.
    """
    yaml_path = PARAMETERS_DIR / category / f"{variable_name}.yaml"

    # The parameter files contain non-ASCII text (e.g. a pound sign in a
    # comment), so decode explicitly rather than relying on the platform
    # default encoding.
    with open(yaml_path, encoding="utf-8") as f:
        data = yaml.safe_load(f)

    # An empty ``values:`` section parses to None; treat it as "no entries"
    # so the caller gets the ValueError below instead of an AttributeError.
    values = data["values"] or {}
    applicable_value = None

    # YAML yields datetime.date for unquoted dates, str for quoted ones and
    # int for a bare year. Ordering on the ISO text of each key keeps the
    # entries chronological while allowing those key types to be mixed
    # (sorting the raw keys would raise TypeError).
    for date_key in sorted(values, key=str):
        if hasattr(date_key, "year"):
            # It's a datetime.date object
            date_year = date_key.year
        else:
            # It's a string such as "2015-01-01", or a bare year
            date_year = int(str(date_key).split("-")[0])

        if date_year > year:
            # Keys are in chronological order, so nothing later can apply.
            break
        applicable_value = values[date_key]

    if applicable_value is None:
        raise ValueError(
            f"No value found for {category}/{variable_name} in {year}"
        )

    # Some files store whole numbers (e.g. ``1``); honour the float contract.
    return float(applicable_value)


def load_take_up_rate(variable_name: str, year: int = 2015) -> float:
    """Load take-up rate from YAML parameter files.

    Thin wrapper around ``load_parameter`` for the ``take_up`` category.

    Args:
        variable_name: Name of the take-up parameter file (without .yaml)
        year: Year for which to get the rate

    Returns:
        Take-up rate as a float between 0 and 1

    Raises:
        ValueError: If no entry starts in or before ``year``.
    """
    return load_parameter("take_up", variable_name, year)
2024-01-01: 0.1252 # Current BBC estimate diff --git a/policyengine_uk_data/parameters/stochastic/tv_ownership_rate.yaml b/policyengine_uk_data/parameters/stochastic/tv_ownership_rate.yaml new file mode 100644 index 00000000..5880d629 --- /dev/null +++ b/policyengine_uk_data/parameters/stochastic/tv_ownership_rate.yaml @@ -0,0 +1,10 @@ +description: Percentage of households that own a functioning colour TV +metadata: + unit: /1 + label: TV ownership rate + reference: + - title: Ofcom - 95% of UK homes had at least one TV set in 2020 + href: https://www.statista.com/statistics/269969/number-of-tv-households-in-the-uk/ +values: + 2015-01-01: 0.96 + 2020-01-01: 0.95 diff --git a/policyengine_uk_data/parameters/take_up/child_benefit.yaml b/policyengine_uk_data/parameters/take_up/child_benefit.yaml new file mode 100644 index 00000000..b3779722 --- /dev/null +++ b/policyengine_uk_data/parameters/take_up/child_benefit.yaml @@ -0,0 +1,9 @@ +description: Share of eligible children that participate in Child Benefit +metadata: + unit: /1 + reference: + - title: "Child Benefit statistics: 2022 annual release" + href: https://www.gov.uk/government/statistics/child-benefit-statistics-annual-release-august-2022/child-benefit-statistics-annual-release-data-at-august-2022#:~:text=since%202012%20the%20take%2Dup,level%20in%202022%20of%2089%25. +values: + 2012-01-01: 0.97 + 2022-01-01: 0.89 diff --git a/policyengine_uk_data/parameters/take_up/child_benefit_opts_out_rate.yaml b/policyengine_uk_data/parameters/take_up/child_benefit_opts_out_rate.yaml new file mode 100644 index 00000000..5a78e8d3 --- /dev/null +++ b/policyengine_uk_data/parameters/take_up/child_benefit_opts_out_rate.yaml @@ -0,0 +1,9 @@ +description: Percentage of fully High Income Child Benefit Charge-liable families who opt out of Child Benefit. 
+metadata: + unit: /1 + label: Child Benefit HITC-liable opt-out rate + reference: + - title: "Child Benefit Statistics: Annual Release, August 2022" + href: https://www.gov.uk/government/statistics/child-benefit-statistics-annual-release-august-2022/child-benefit-statistics-annual-release-data-at-august-2022 +values: + 2019-01-01: 0.23 # 3m families have ANI over £60k in the 2023 FRS, 683k families opt out of CB. diff --git a/policyengine_uk_data/parameters/take_up/extended_childcare.yaml b/policyengine_uk_data/parameters/take_up/extended_childcare.yaml new file mode 100644 index 00000000..f8e27d48 --- /dev/null +++ b/policyengine_uk_data/parameters/take_up/extended_childcare.yaml @@ -0,0 +1,9 @@ +description: Extended childcare entitlement take-up rate +metadata: + unit: /1 + period: year + reference: + - title: Empirical estimate from FRS data + href: https://github.com/PolicyEngine/policyengine-uk-data +values: + 2015-01-01: 0.812 diff --git a/policyengine_uk_data/parameters/take_up/marriage_allowance.yaml b/policyengine_uk_data/parameters/take_up/marriage_allowance.yaml new file mode 100644 index 00000000..ff528bba --- /dev/null +++ b/policyengine_uk_data/parameters/take_up/marriage_allowance.yaml @@ -0,0 +1,6 @@ +description: Percentage of eligible couples who claim Marriage Allowance. +metadata: + unit: /1 + label: Marriage Allowance take-up rate +values: + 2000-01-01: 1 diff --git a/policyengine_uk_data/parameters/take_up/pension_credit.yaml b/policyengine_uk_data/parameters/take_up/pension_credit.yaml new file mode 100644 index 00000000..dc31d850 --- /dev/null +++ b/policyengine_uk_data/parameters/take_up/pension_credit.yaml @@ -0,0 +1,10 @@ +description: Share of eligible Pension Credit recipients that participate. 
+metadata: + label: Pension Credit take-up rate + name: PC_takeup + unit: /1 + reference: + - title: "Income-related benefits: estimates of take-up: financial year 2019 to 2020" + href: https://www.gov.uk/government/statistics/income-related-benefits-estimates-of-take-up-financial-year-2019-to-2020/income-related-benefits-estimates-of-take-up-financial-year-2019-to-2020#pension-credit-2 +values: + 2015-01-01: 0.7 diff --git a/policyengine_uk_data/parameters/take_up/targeted_childcare.yaml b/policyengine_uk_data/parameters/take_up/targeted_childcare.yaml new file mode 100644 index 00000000..2693a72d --- /dev/null +++ b/policyengine_uk_data/parameters/take_up/targeted_childcare.yaml @@ -0,0 +1,9 @@ +description: Targeted childcare entitlement take-up rate +metadata: + unit: /1 + period: year + reference: + - title: Empirical estimate from FRS data + href: https://github.com/PolicyEngine/policyengine-uk-data +values: + 2015-01-01: 0.597 diff --git a/policyengine_uk_data/parameters/take_up/tax_free_childcare.yaml b/policyengine_uk_data/parameters/take_up/tax_free_childcare.yaml new file mode 100644 index 00000000..fbddc2ad --- /dev/null +++ b/policyengine_uk_data/parameters/take_up/tax_free_childcare.yaml @@ -0,0 +1,9 @@ +description: Tax-Free Childcare take-up rate +metadata: + unit: /1 + period: year + reference: + - title: Empirical estimate from FRS data + href: https://github.com/PolicyEngine/policyengine-uk-data +values: + 2015-01-01: 0.586 diff --git a/policyengine_uk_data/parameters/take_up/universal_childcare.yaml b/policyengine_uk_data/parameters/take_up/universal_childcare.yaml new file mode 100644 index 00000000..f3ac5589 --- /dev/null +++ b/policyengine_uk_data/parameters/take_up/universal_childcare.yaml @@ -0,0 +1,9 @@ +description: Universal childcare entitlement take-up rate +metadata: + unit: /1 + period: year + reference: + - title: Empirical estimate from FRS data + href: https://github.com/PolicyEngine/policyengine-uk-data +values: + 2015-01-01: 
class TestTakeUpRateParameters:
    """Each take-up rate file must yield a usable probability for 2024."""

    def test_child_benefit_rate_loads(self):
        child_benefit = load_take_up_rate("child_benefit", 2024)
        assert child_benefit > 0 and child_benefit <= 1
        # Child Benefit is expected to have high take-up.
        assert child_benefit > 0.8

    def test_pension_credit_rate_loads(self):
        pension_credit = load_take_up_rate("pension_credit", 2024)
        assert pension_credit > 0 and pension_credit <= 1

    def test_universal_credit_rate_loads(self):
        universal_credit = load_take_up_rate("universal_credit", 2024)
        assert universal_credit > 0 and universal_credit <= 1

    def test_marriage_allowance_rate_loads(self):
        marriage_allowance = load_take_up_rate("marriage_allowance", 2024)
        assert marriage_allowance > 0 and marriage_allowance <= 1

    def test_child_benefit_opts_out_rate_loads(self):
        opt_out = load_take_up_rate("child_benefit_opts_out_rate", 2024)
        # Zero is permitted here, unlike the take-up rates above.
        assert opt_out >= 0 and opt_out <= 1

    def test_tax_free_childcare_rate_loads(self):
        tax_free_childcare = load_take_up_rate("tax_free_childcare", 2024)
        assert tax_free_childcare > 0 and tax_free_childcare <= 1


class TestStochasticParameters:
    """Each file in the ``stochastic`` category must load for 2024."""

    def test_tv_ownership_rate_loads(self):
        ownership = load_parameter("stochastic", "tv_ownership_rate", 2024)
        assert ownership > 0 and ownership <= 1
        # Most households own TVs.
        assert ownership > 0.9

    def test_tv_licence_evasion_rate_loads(self):
        evasion = load_parameter(
            "stochastic", "tv_licence_evasion_rate", 2024
        )
        assert evasion >= 0 and evasion <= 1
        # Evasion rate should be low.
        assert evasion < 0.2

    def test_first_time_buyer_rate_loads(self):
        first_time_buyers = load_parameter(
            "stochastic", "first_time_buyer_rate", 2024
        )
        assert first_time_buyers >= 0 and first_time_buyers <= 1


class TestSeededRandomness:
    """A seeded NumPy generator must give reproducible draws."""

    def test_same_seed_produces_same_results(self):
        """Two generators built from one seed emit the same sequence."""
        seed = 100
        n = 1000

        first_draws, second_draws = (
            np.random.default_rng(seed=seed).random(n) for _ in range(2)
        )

        np.testing.assert_array_equal(first_draws, second_draws)

    def test_different_seeds_produce_different_results(self):
        """Generators built from distinct seeds emit distinct sequences."""
        n = 1000

        draws_seed_100 = np.random.default_rng(seed=100).random(n)
        draws_seed_200 = np.random.default_rng(seed=200).random(n)

        assert not np.array_equal(draws_seed_100, draws_seed_200)


class TestTakeUpProportions:
    """Thresholding uniform draws at a rate must behave like take-up."""

    def test_take_up_produces_expected_proportion(self):
        """The share of positive decisions should sit near the rate."""
        rate, n = 0.7, 10000
        rng = np.random.default_rng(seed=42)

        share_claiming = np.mean(rng.random(n) < rate)

        # Should be within 5 percentage points of the rate
        assert abs(share_claiming - rate) < 0.05

    def test_boolean_generation(self):
        """Thresholded draws should form a boolean array."""
        rate, n = 0.5, 100
        rng = np.random.default_rng(seed=42)

        decisions = rng.random(n) < rate

        assert decisions.dtype == bool
        assert set(decisions) <= {True, False}