Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
- bump: minor
changes:
fixed:
- Set property_purchased stochastically at 3.85% based on HMRC housing transaction data, fixing unrealistic SDLT charges that caused 224% tax rates in the first income decile.
added:
- Tests for property_purchased rate and SDLT total validation against official HMRC figures.
- Tests for low-income decile sanity checks to prevent negative net incomes and impossible tax rates.
20 changes: 20 additions & 0 deletions policyengine_uk_data/datasets/frs.py
Original file line number Diff line number Diff line change
Expand Up @@ -869,6 +869,26 @@ def determine_education_level(fted_val, typeed2_val, age_val):

pe_benunit["is_married"] = frs["benunit"].famtypb2.isin([5, 7])

# Stochastically set property_purchased based on UK housing transaction rate.
# Previously defaulted to True in policyengine-uk, causing all households
# to be charged SDLT as if they just bought their property (£370bn total).
#
# Sources:
# - Transactions: HMRC 2024 - 1.1m/year
# https://www.gov.uk/government/statistics/monthly-property-transactions-completed-in-the-uk-with-value-40000-or-above
# - Households: ONS 2024 - 28.6m
# https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/families/bulletins/familiesandhouseholds/2024
# - Rate: 1.1m / 28.6m = 3.85%
#
# Verification against official SDLT revenue (2024-25):
# - Official SDLT: £13.9bn (https://www.gov.uk/government/statistics/uk-stamp-tax-statistics)
# - With fix (3.85%): £15.7bn (close to official)
# - Without fix (100%): £370bn (26x too high)
PROPERTY_PURCHASE_RATE = 0.0385
pe_household["property_purchased"] = (
np.random.random(len(pe_household)) < PROPERTY_PURCHASE_RATE
)

dataset = UKSingleYearDataset(
person=pe_person,
benunit=pe_benunit,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ reforms:
parameters:
gov.hmrc.national_insurance.class_1.rates.employee.main: 0.1
- name: Raise VAT standard rate by 2pp
expected_impact: 19.3
expected_impact: 28.6
parameters:
gov.hmrc.vat.standard_rate: 0.22
- name: Raise additional rate by 3pp
Expand Down
153 changes: 153 additions & 0 deletions policyengine_uk_data/tests/test_low_income_deciles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
"""
Tests for low-income decile sanity checks.

These tests ensure that the first income decile (lowest income households)
has reasonable tax and net income values. This catches bugs like the
property_purchased issue where incorrect defaults led to:
- 224% tax rates in the first decile
- Negative net incomes for low-income households

These tests confirm the fix works and prevent similar issues in the future.
"""

import pytest
import pandas as pd


def test_first_decile_tax_rate_reasonable(baseline):
"""Test that first decile effective tax rate is below 175%.

The first decile by net income includes households with very low market
income (retirees, students, unemployed), so even reasonable taxes can
result in high effective rates when divided by low market income.

Without fix: 224% tax rate (pathological - all households charged SDLT)
With fix: ~147% (acceptable given low market income in D1)

Threshold of 175% catches pathological cases while allowing for the
inherent high ratio in low-income deciles.
"""
household_weight = baseline.calculate("household_weight", 2025).values
net_income = baseline.calculate("household_net_income", 2025).values
market_income = baseline.calculate("household_market_income", 2025).values
household_tax = baseline.calculate("household_tax", 2025).values

decile = pd.qcut(net_income, 10, labels=False, duplicates="drop")

d1_mask = decile == 0
d1_tax = (household_tax[d1_mask] * household_weight[d1_mask]).sum()
d1_market = (market_income[d1_mask] * household_weight[d1_mask]).sum()

if d1_market > 0:
d1_tax_rate = d1_tax / d1_market
assert d1_tax_rate < 1.75, (
f"First decile tax rate is {d1_tax_rate:.0%}, which exceeds 175%. "
f"Total D1 tax: £{d1_tax/1e9:.1f}bn, "
f"Total D1 market income: £{d1_market/1e9:.1f}bn. "
"This likely indicates a bug in property_purchased or similar variable."
)


def test_first_decile_average_tax_reasonable(baseline):
"""Test that first decile average household tax is reasonable.

Without fix: £90,988 average tax (unrealistic)
With fix: Should be below £50,000
"""
household_weight = baseline.calculate("household_weight", 2025).values
net_income = baseline.calculate("household_net_income", 2025).values
household_tax = baseline.calculate("household_tax", 2025).values

decile = pd.qcut(net_income, 10, labels=False, duplicates="drop")

d1_mask = decile == 0
d1_avg_tax = (
household_tax[d1_mask] * household_weight[d1_mask]
).sum() / household_weight[d1_mask].sum()

max_reasonable_d1_tax = 50_000

assert d1_avg_tax < max_reasonable_d1_tax, (
f"First decile average tax is £{d1_avg_tax:,.0f}, "
f"which exceeds £{max_reasonable_d1_tax:,}. "
"This likely indicates a bug in property_purchased or similar variable "
"causing unrealistic stamp duty charges."
)


def test_first_decile_net_income_not_severely_negative(baseline):
"""Test that first decile net income is not severely negative.

Without fix: -£37,452 average (due to massive SDLT)
With fix: Should be above -£10,000
"""
household_weight = baseline.calculate("household_weight", 2025).values
net_income = baseline.calculate("household_net_income", 2025).values

decile = pd.qcut(net_income, 10, labels=False, duplicates="drop")

d1_mask = decile == 0
d1_avg_net = (
net_income[d1_mask] * household_weight[d1_mask]
).sum() / household_weight[d1_mask].sum()

assert d1_avg_net > -10_000, (
f"First decile average net income is £{d1_avg_net:,.0f}, "
"which is significantly negative. This likely indicates a bug "
"in property_purchased or similar variable causing unrealistic "
"tax charges that push net income negative."
)


def test_decile_tax_ordering(baseline):
"""Test that higher deciles pay more tax than lower deciles.

Without fix: D1 (£90k) > D10 (£79k) - inverted!
With fix: D1 < D10 (correct ordering)
"""
household_weight = baseline.calculate("household_weight", 2025).values
net_income = baseline.calculate("household_net_income", 2025).values
household_tax = baseline.calculate("household_tax", 2025).values

decile = pd.qcut(net_income, 10, labels=False, duplicates="drop")

decile_taxes = []
for d in range(10):
mask = decile == d
avg_tax = (
household_tax[mask] * household_weight[mask]
).sum() / household_weight[mask].sum()
decile_taxes.append(avg_tax)

d1_tax = decile_taxes[0]
d10_tax = decile_taxes[9]

assert d1_tax < d10_tax, (
f"First decile tax (£{d1_tax:,.0f}) is higher than "
f"tenth decile tax (£{d10_tax:,.0f}). "
"This inverted pattern indicates a bug in tax calculations, "
"likely from property_purchased being incorrectly set."
)


def test_no_excessive_negative_incomes(baseline):
"""Test that excessive negative incomes are limited.

Without fix: 2.3% of households below -£50k
With fix: Should be below 1%
"""
household_weight = baseline.calculate("household_weight", 2025).values
net_income = baseline.calculate("household_net_income", 2025).values

total_households = household_weight.sum()
severe_negative_mask = net_income < -50_000
severe_negative_count = household_weight[severe_negative_mask].sum()
severe_negative_pct = severe_negative_count / total_households

max_allowed_pct = 0.01

assert severe_negative_pct < max_allowed_pct, (
f"{severe_negative_pct:.1%} of households have net income "
f"below -£50,000. This exceeds {max_allowed_pct:.0%} and indicates "
"a potential bug in tax calculations."
)
93 changes: 93 additions & 0 deletions policyengine_uk_data/tests/test_property_purchased.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
"""
Test that property_purchased is set correctly in the enhanced FRS dataset.

The property_purchased variable should be stochastically set based on
UK housing transaction rates (~3.85% of households per year).

Sources:
- Transactions: HMRC 2024 - 1.1m/year
https://www.gov.uk/government/statistics/monthly-property-transactions-completed-in-the-uk-with-value-40000-or-above
- Households: ONS 2024 - 28.6m
https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/families/bulletins/familiesandhouseholds/2024
- Rate: 1.1m / 28.6m = 3.85%

Verification against official SDLT revenue (2024-25):
- Official SDLT: £13.9bn (https://www.gov.uk/government/statistics/uk-stamp-tax-statistics)
- With fix (3.85%): ~£15bn (close to official)
- Without fix (100%): £370bn (26x too high)
"""

import pytest


PROPERTY_PURCHASE_RATE = 0.0385


def test_property_purchased_rate(baseline):
"""Test that property_purchased rate is approximately 3.85%."""
property_purchased = baseline.calculate("property_purchased", 2025).values

n_households = len(property_purchased)
true_count = property_purchased.sum()
actual_rate = true_count / n_households

# Rate should be approximately 3.85% (allow for random variation)
target_rate = PROPERTY_PURCHASE_RATE
tolerance = 0.02

assert (
abs(actual_rate - target_rate) < tolerance
), f"property_purchased rate {actual_rate:.2%} is not close to target {target_rate:.2%}"


def test_property_purchased_not_all_true(baseline):
"""Test that not all households have property_purchased = True."""
property_purchased = baseline.calculate("property_purchased", 2025).values

true_count = property_purchased.sum()
n_households = len(property_purchased)

# Should NOT be 100% True (the bug we fixed)
assert (
true_count < n_households * 0.1
), f"Too many households have property_purchased=True ({true_count}/{n_households})"


def test_property_purchased_has_some_true(baseline):
"""Test that some households have property_purchased = True."""
property_purchased = baseline.calculate("property_purchased", 2025).values

true_count = property_purchased.sum()

# Should have some True values (realistic purchasing rate)
assert true_count > 0, "No households have property_purchased=True"


def test_sdlt_total_reasonable(baseline):
"""Test that total SDLT revenue is realistic.

Official SDLT revenue (2024-25): £13.9bn
Source: https://www.gov.uk/government/statistics/uk-stamp-tax-statistics

Without fix (100% property_purchased=True): £370bn (26x too high)
With fix (3.85% rate): ~£15bn (close to official)
"""
expected_sdlt = baseline.calculate("expected_sdlt", 2025).values
household_weight = baseline.calculate("household_weight", 2025).values
total_sdlt = (expected_sdlt * household_weight).sum()

# Total SDLT should be within reasonable range of official figures
min_sdlt = 5e9 # £5bn minimum
max_sdlt = 50e9 # £50bn maximum (official is ~£14bn)

assert total_sdlt > min_sdlt, (
f"Total SDLT £{total_sdlt/1e9:.1f}bn is too low "
f"(minimum expected: £{min_sdlt/1e9:.1f}bn)"
)

assert total_sdlt < max_sdlt, (
f"Total SDLT £{total_sdlt/1e9:.1f}bn is unrealistically high "
f"(maximum expected: £{max_sdlt/1e9:.1f}bn). "
f"Official SDLT is ~£14bn. "
"This suggests property_purchased may be incorrectly set to True for all households."
)