diff --git a/.beads/.gitignore b/.beads/.gitignore new file mode 100644 index 00000000..f438450f --- /dev/null +++ b/.beads/.gitignore @@ -0,0 +1,29 @@ +# SQLite databases +*.db +*.db?* +*.db-journal +*.db-wal +*.db-shm + +# Daemon runtime files +daemon.lock +daemon.log +daemon.pid +bd.sock + +# Legacy database files +db.sqlite +bd.db + +# Merge artifacts (temporary files from 3-way merge) +beads.base.jsonl +beads.base.meta.json +beads.left.jsonl +beads.left.meta.json +beads.right.jsonl +beads.right.meta.json + +# Keep JSONL exports and config (source of truth for git) +!issues.jsonl +!metadata.json +!config.json diff --git a/.beads/README.md b/.beads/README.md new file mode 100644 index 00000000..8d603245 --- /dev/null +++ b/.beads/README.md @@ -0,0 +1,81 @@ +# Beads - AI-Native Issue Tracking + +Welcome to Beads! This repository uses **Beads** for issue tracking - a modern, AI-native tool designed to live directly in your codebase alongside your code. + +## What is Beads? + +Beads is issue tracking that lives in your repo, making it perfect for AI coding agents and developers who want their issues close to their code. No web UI required - everything works through the CLI and integrates seamlessly with git. + +**Learn more:** [github.com/steveyegge/beads](https://github.com/steveyegge/beads) + +## Quick Start + +### Essential Commands + +```bash +# Create new issues +bd create "Add user authentication" + +# View all issues +bd list + +# View issue details +bd show <issue-id> + +# Update issue status +bd update <issue-id> --status in-progress +bd update <issue-id> --status done + +# Sync with git remote +bd sync +``` + +### Working with Issues + +Issues in Beads are: +- **Git-native**: Stored in `.beads/issues.jsonl` and synced like code +- **AI-friendly**: CLI-first design works perfectly with AI coding agents +- **Branch-aware**: Issues can follow your branch workflow +- **Always in sync**: Auto-syncs with your commits + +## Why Beads? 
+ +✨ **AI-Native Design** +- Built specifically for AI-assisted development workflows +- CLI-first interface works seamlessly with AI coding agents +- No context switching to web UIs + +🚀 **Developer Focused** +- Issues live in your repo, right next to your code +- Works offline, syncs when you push +- Fast, lightweight, and stays out of your way + +🔧 **Git Integration** +- Automatic sync with git commits +- Branch-aware issue tracking +- Intelligent JSONL merge resolution + +## Get Started with Beads + +Try Beads in your own projects: + +```bash +# Install Beads +curl -sSL https://raw.githubusercontent.com/steveyegge/beads/main/scripts/install.sh | bash + +# Initialize in your repo +bd init + +# Create your first issue +bd create "Try out Beads" +``` + +## Learn More + +- **Documentation**: [github.com/steveyegge/beads/docs](https://github.com/steveyegge/beads/tree/main/docs) +- **Quick Start Guide**: Run `bd quickstart` +- **Examples**: [github.com/steveyegge/beads/examples](https://github.com/steveyegge/beads/tree/main/examples) + +--- + +*Beads: Issue tracking that moves at the speed of thought* ⚡ diff --git a/.beads/config.yaml b/.beads/config.yaml new file mode 100644 index 00000000..95c5f3e7 --- /dev/null +++ b/.beads/config.yaml @@ -0,0 +1,56 @@ +# Beads Configuration File +# This file configures default behavior for all bd commands in this repository +# All settings can also be set via environment variables (BD_* prefix) +# or overridden with command-line flags + +# Issue prefix for this repository (used by bd init) +# If not set, bd init will auto-detect from directory name +# Example: issue-prefix: "myproject" creates issues like "myproject-1", "myproject-2", etc. 
+# issue-prefix: "" + +# Use no-db mode: load from JSONL, no SQLite, write back after each command +# When true, bd will use .beads/issues.jsonl as the source of truth +# instead of SQLite database +# no-db: false + +# Disable daemon for RPC communication (forces direct database access) +# no-daemon: false + +# Disable auto-flush of database to JSONL after mutations +# no-auto-flush: false + +# Disable auto-import from JSONL when it's newer than database +# no-auto-import: false + +# Enable JSON output by default +# json: false + +# Default actor for audit trails (overridden by BD_ACTOR or --actor) +# actor: "" + +# Path to database (overridden by BEADS_DB or --db) +# db: "" + +# Auto-start daemon if not running (can also use BEADS_AUTO_START_DAEMON) +# auto-start-daemon: true + +# Debounce interval for auto-flush (can also use BEADS_FLUSH_DEBOUNCE) +# flush-debounce: "5s" + +# Multi-repo configuration (experimental - bd-307) +# Allows hydrating from multiple repositories and routing writes to the correct JSONL +# repos: +# primary: "." 
# Primary repo (where this database lives) +# additional: # Additional repos to hydrate from (read-only) +# - ~/beads-planning # Personal planning repo +# - ~/work-planning # Work planning repo + +# Integration settings (access with 'bd config get/set') +# These are stored in the database, not in this file: +# - jira.url +# - jira.project +# - linear.url +# - linear.api-key +# - github.org +# - github.repo +# - sync.branch - Git branch for beads commits (use BEADS_SYNC_BRANCH env var or bd config set) diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl new file mode 100644 index 00000000..252ee9c2 --- /dev/null +++ b/.beads/issues.jsonl @@ -0,0 +1 @@ +{"id":"policyengine-uk-data-hpf","title":"Add num_vehicles as predictor to fuel imputation","description":"","status":"closed","priority":2,"issue_type":"feature","created_at":"2025-12-03T13:36:37.831279-05:00","updated_at":"2025-12-03T13:44:03.075293-05:00","closed_at":"2025-12-03T13:44:03.075293-05:00"} diff --git a/.beads/metadata.json b/.beads/metadata.json new file mode 100644 index 00000000..4faf148a --- /dev/null +++ b/.beads/metadata.json @@ -0,0 +1,5 @@ +{ + "database": "beads.db", + "jsonl_export": "issues.jsonl", + "last_bd_version": "0.26.0" +} \ No newline at end of file diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..807d5983 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,3 @@ + +# Use bd merge for beads JSONL files +.beads/issues.jsonl merge=beads diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..201e3f99 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: minor + changes: + added: + - Use num_vehicles as predictor for fuel spending imputation diff --git a/docs/imputations.md b/docs/imputations.md new file mode 100644 index 00000000..63c15df4 --- /dev/null +++ b/docs/imputations.md @@ -0,0 +1,223 @@ +# Imputations + +PolicyEngine UK Data enhances the Family Resources Survey with variables from other surveys 
using statistical imputation. Most imputations use **Quantile Regression Forests (QRF)**, which predict the full conditional distribution of target variables given predictor variables; the capital gains and student loan plan steps use other methods, as described in their sections below. + +## Imputation Pipeline Order + +The imputations are applied in this order (dependencies noted): + +1. **Wealth** (from WAS) +2. **Consumption** (from LCFS) — requires `num_vehicles` from wealth +3. **VAT** (from ETB) +4. **Public Services** (from ETB) +5. **Income** (from SPI) +6. **Capital Gains** (from Advani-Summers data) +7. **Salary Sacrifice** (from FRS subsample) +8. **Student Loan Plan** (rule-based, from age) + +--- + +## Wealth Imputation + +**Source:** Wealth and Assets Survey (WAS) Round 7 (2018-2020) + +Imputes household wealth components using demographic and income predictors. + +### Predictors +| Variable | Description | +|----------|-------------| +| `household_net_income` | Total household income after taxes | +| `num_adults` | Number of adults in household | +| `num_children` | Number of children in household | +| `private_pension_income` | Income from private pensions | +| `employment_income` | Income from employment | +| `self_employment_income` | Income from self-employment | +| `capital_income` | Income from capital/investments | +| `num_bedrooms` | Number of bedrooms in dwelling | +| `council_tax` | Annual council tax payment | +| `is_renting` | Whether household rents (vs owns) | +| `region` | UK region | + +### Outputs +| Variable | Description | +|----------|-------------| +| `owned_land` | Value of owned land | +| `property_wealth` | Total property wealth | +| `corporate_wealth` | Shares, pensions, investment ISAs | +| `gross_financial_wealth` | Total financial assets | +| `net_financial_wealth` | Financial assets minus liabilities | +| `main_residence_value` | Value of main home | +| `other_residential_property_value` | Value of other properties | +| `non_residential_property_value` | Value of non-residential property | +| `savings` | Savings account 
balances | +| `num_vehicles` | Number of vehicles owned | + +--- + +## Consumption Imputation + +**Source:** Living Costs and Food Survey (LCFS) 2021-22 + +Imputes household spending patterns for indirect tax modeling. + +### Predictors +| Variable | Description | +|----------|-------------| +| `is_adult` | Number of adults | +| `is_child` | Number of children | +| `region` | UK region | +| `employment_income` | Employment income | +| `self_employment_income` | Self-employment income | +| `private_pension_income` | Private pension income | +| `household_net_income` | Total household income | +| `has_fuel_consumption` | Whether household buys petrol/diesel (from WAS) | + +### Outputs +| Variable | Description | +|----------|-------------| +| `food_and_non_alcoholic_beverages_consumption` | Food spending | +| `alcohol_and_tobacco_consumption` | Alcohol/tobacco spending | +| `clothing_and_footwear_consumption` | Clothing spending | +| `housing_water_and_electricity_consumption` | Housing costs | +| `household_furnishings_consumption` | Furnishings spending | +| `health_consumption` | Health spending | +| `transport_consumption` | Transport spending | +| `communication_consumption` | Communication spending | +| `recreation_consumption` | Recreation spending | +| `education_consumption` | Education spending | +| `restaurants_and_hotels_consumption` | Restaurants/hotels spending | +| `miscellaneous_consumption` | Other spending | +| `petrol_spending` | Petrol fuel spending | +| `diesel_spending` | Diesel fuel spending | +| `domestic_energy_consumption` | Home energy spending | + +### Bridging WAS Vehicle Ownership to LCFS Fuel Spending + +LCFS 2-week diaries undercount fuel purchasers (58%) compared to actual vehicle ownership (78% per NTS 2024). We bridge this gap using WAS vehicle data: + +1. 
**In WAS**: Create `has_fuel_consumption` from vehicle ownership: + - `has_fuel = (num_vehicles > 0) AND (random < 0.90)` + - The remaining 10% accounts for EVs/PHEVs that don't buy petrol/diesel + - Source: NTS 2024 shows 59% petrol + 30% diesel = 89% ICE, rounded up to 90% because hybrids still use some fuel + +2. **Train QRF**: Predict `has_fuel_consumption` from demographics (income, adults, children, region) + +3. **Apply to LCFS**: Impute `has_fuel_consumption` to LCFS households before training consumption model + +4. **At FRS imputation time**: Compute `has_fuel_consumption` directly from `num_vehicles` (already calibrated to NTS targets) + +This ensures fuel duty incidence aligns with actual vehicle ownership (~70% of households = 78% vehicles × 90% ICE) rather than LCFS diary randomness. + +--- + +## VAT Imputation + +**Source:** Effects of Taxes and Benefits (ETB) 1977-2021 + +Imputes the share of household spending subject to full-rate VAT. + +### Predictors +| Variable | Description | +|----------|-------------| +| `is_adult` | Number of adults | +| `is_child` | Number of children | +| `is_SP_age` | Number at State Pension age | +| `household_net_income` | Total household income | + +### Outputs +| Variable | Description | +|----------|-------------| +| `full_rate_vat_expenditure_rate` | Share of spending at 20% VAT | + +--- + +## Income Imputation + +**Source:** Survey of Personal Incomes (SPI) 2020-21 + +Imputes detailed income components to create "synthetic taxpayers" with higher incomes than typically captured in the FRS. These records initially have zero weight but can be upweighted during calibration to match HMRC income distribution targets. 
+ +### Predictors +| Variable | Description | +|----------|-------------| +| `age` | Person's age | +| `gender` | Male/Female | +| `region` | UK region | + +### Outputs +| Variable | Description | +|----------|-------------| +| `employment_income` | Income from employment | +| `self_employment_income` | Income from self-employment | +| `savings_interest_income` | Interest on savings | +| `dividend_income` | Dividend income | +| `private_pension_income` | Private pension income | +| `property_income` | Rental/property income | + +--- + +## Capital Gains Imputation + +**Source:** Advani-Summers capital gains distribution data + +Uses a gradient-based optimization approach rather than QRF. The dataset is doubled, with one half receiving imputed capital gains amounts. Weights are then optimized to match the empirical relationship between total income and capital gains incidence. + +### Method +1. Double the dataset (original + clone) +2. Assign capital gains to one adult per household in the cloned half +3. Optimize blend weights to match income-band capital gains incidence from Advani-Summers data + +--- + +## Salary Sacrifice Imputation + +**Source:** FRS 2023-24 (respondents asked about salary sacrifice) + +Imputes pension contributions made via salary sacrifice arrangements. + +### Predictors +| Variable | Description | +|----------|-------------| +| `age` | Person's age | +| `employment_income` | Employment income | + +### Outputs +| Variable | Description | +|----------|-------------| +| `pension_contributions_via_salary_sacrifice` | Annual SS pension contributions | + +### Training Data +- FRS respondents with `SALSAC='1'` (Yes): ~224 jobs with reported amounts +- FRS respondents with `SALSAC='2'` (No): ~3,803 jobs with 0 +- Imputation candidates (`SALSAC=' '`): ~13,265 jobs + +--- + +## Student Loan Plan Imputation + +**Source:** Rule-based (not QRF) + +Assigns student loan plan type based on age and reported repayments. + +### Logic +1. 
If `student_loan_repayments > 0`, person has a loan +2. Estimate university start year = `simulation_year - age + 18` +3. Assign plan: + - **Plan 1**: Started before September 2012 + - **Plan 2**: Started September 2012 - August 2023 + - **Plan 5**: Started September 2023 onwards + +--- + +## Calibration Targets + +After imputation, household weights are calibrated to match aggregate statistics from: + +| Source | Targets | +|--------|---------| +| **OBR** | Tax revenues, benefit expenditures (20 programs) | +| **ONS** | Age/region populations, family types, tenure | +| **HMRC** | Income distributions by band (7 income types × 14 bands) | +| **DWP** | Universal Credit statistics, two-child limit | +| **NTS** | Vehicle ownership (22% none, 44% one, 34% two+) | +| **Council Tax** | Households by council tax band | diff --git a/docs/myst.yml b/docs/myst.yml index 165211b4..29f21f0a 100644 --- a/docs/myst.yml +++ b/docs/myst.yml @@ -10,6 +10,7 @@ project: toc: - file: intro.md - file: methodology.ipynb + - file: imputations.md - file: validation/index.md children: - file: validation/national.ipynb diff --git a/policyengine_uk_data/datasets/create_datasets.py b/policyengine_uk_data/datasets/create_datasets.py index 9e605e29..044c61a3 100644 --- a/policyengine_uk_data/datasets/create_datasets.py +++ b/policyengine_uk_data/datasets/create_datasets.py @@ -62,14 +62,16 @@ def main(): ) # Apply imputations with progress tracking - update_dataset("Impute consumption", "processing") - frs = impute_consumption(frs) - update_dataset("Impute consumption", "completed") - + # Wealth must be imputed before consumption because consumption + # uses num_vehicles as a predictor for fuel spending update_dataset("Impute wealth", "processing") frs = impute_wealth(frs) update_dataset("Impute wealth", "completed") + update_dataset("Impute consumption", "processing") + frs = impute_consumption(frs) + update_dataset("Impute consumption", "completed") + update_dataset("Impute VAT", 
"processing") frs = impute_vat(frs) update_dataset("Impute VAT", "completed") diff --git a/policyengine_uk_data/datasets/imputations/consumption.py b/policyengine_uk_data/datasets/imputations/consumption.py index e42614c9..49d4127c 100644 --- a/policyengine_uk_data/datasets/imputations/consumption.py +++ b/policyengine_uk_data/datasets/imputations/consumption.py @@ -1,3 +1,16 @@ +""" +Consumption imputation using Living Costs and Food Survey data. + +This module imputes household consumption patterns (including fuel spending) +using QRF models trained on LCFS data, with vehicle ownership information +from the Wealth and Assets Survey to improve fuel spending predictions. + +Key innovation: We impute `has_fuel_consumption` to WAS based on vehicle +ownership, then use this to bridge WAS and LCFS for fuel spending imputation. +This addresses the issue that LCFS 2-week diaries undercount fuel purchases +(58% have any fuel) vs actual vehicle ownership (78% per NTS 2024). +""" + import pandas as pd from pathlib import Path import numpy as np @@ -9,6 +22,14 @@ LCFS_TAB_FOLDER = STORAGE_FOLDER / "lcfs_2021_22" +# EV/ICE vehicle mix from NTS 2024 +# Source: https://www.gov.uk/government/statistics/national-travel-survey-2024 +# "Around 59% of cars people owned were petrol, 30% were diesel, 6% hybrid, +# 4% battery electric and 2% plug-in hybrid." +# ICE share = 59% + 30% = 89%, plus hybrids still use some fuel +# We use 90% as the probability a vehicle owner buys petrol/diesel +NTS_2024_ICE_VEHICLE_SHARE = 0.90 + REGIONS = { 1: "NORTH_EAST", 2: "NORTH_WEST", @@ -65,6 +86,7 @@ "self_employment_income", "private_pension_income", "household_net_income", + "has_fuel_consumption", # Imputed from WAS vehicle ownership ] IMPUTATIONS = [ @@ -86,9 +108,127 @@ ] +def create_has_fuel_model(): + """ + Train a model to predict has_fuel_consumption from demographics. 
+ + Uses WAS vehicle ownership to create has_fuel_consumption: + - Households with vehicles have ~90% chance of fuel consumption (ICE vehicles) + - Households without vehicles have ~0% chance + + This bridges the gap between: + - LCFS: 58% of households recorded fuel in 2-week diary + - NTS 2024: 78% of households have vehicles + + Sources: + - NTS 2024 vehicle ownership: https://www.gov.uk/government/statistics/ + national-travel-survey-2024/nts-2024-household-car-availability-and-trends + "22% of households had no vehicle, 44% one vehicle, 34% two or more" + - NTS 2024 fuel type: "59% petrol, 30% diesel, 6% hybrid, 4% BEV, 2% PHEV" + So ~90% of vehicle owners use petrol/diesel (ICE + hybrids) + + Returns: + QRF model predicting has_fuel_consumption from demographics. + """ + from policyengine_uk_data.utils.qrf import QRF + from policyengine_uk_data.datasets.imputations.wealth import ( + WAS_TAB_FOLDER, + REGIONS, + ) + + model_path = STORAGE_FOLDER / "has_fuel_model.pkl" + if model_path.exists(): + return QRF(file_path=model_path) + + # Load WAS with vehicle ownership + was = pd.read_csv( + WAS_TAB_FOLDER / "was_round_7_hhold_eul_march_2022.tab", + sep="\t", + low_memory=False, + ) + was.columns = [c.lower() for c in was.columns] + + # Create has_fuel_consumption from vehicle ownership + # Vehicle owners have 90% chance (ICE vehicles), non-owners have 0% + num_vehicles = was["vcarnr7"].fillna(0).clip(lower=0) + has_vehicle = num_vehicles > 0 + + # Randomly assign fuel consumption based on ICE share + # This simulates that ~10% of vehicle owners have EVs/PHEVs + np.random.seed(42) # Reproducibility + is_ice_vehicle = np.random.random(len(was)) < NTS_2024_ICE_VEHICLE_SHARE + has_fuel = (has_vehicle & is_ice_vehicle).astype(float) + + # Build training DataFrame with predictors available in LCFS + was_df = pd.DataFrame( + { + "household_net_income": was["dvtotinc_bhcr7"], + "num_adults": was["numadultr7"], + "num_children": was["numch18r7"], + 
"private_pension_income": was["dvgippenr7_aggr"], + "employment_income": was["dvgiempr7_aggr"], + "self_employment_income": was["dvgiser7_aggr"], + "region": was["gorr7"].map(REGIONS), + "has_fuel_consumption": has_fuel, + } + ).dropna() + + predictors = [ + "household_net_income", + "num_adults", + "num_children", + "private_pension_income", + "employment_income", + "self_employment_income", + "region", + ] + + model = QRF() + model.fit(was_df[predictors], was_df[["has_fuel_consumption"]]) + model.save(model_path) + return model + + +def impute_has_fuel_to_lcfs(household: pd.DataFrame) -> pd.DataFrame: + """ + Impute has_fuel_consumption to LCFS households using WAS-trained model. + + This provides a consistent fuel consumption indicator based on vehicle + ownership patterns, rather than relying on the LCFS 2-week diary which + underestimates fuel purchasers (58% vs 78% vehicle ownership). + """ + model = create_has_fuel_model() + + input_df = pd.DataFrame( + { + "household_net_income": household["household_net_income"], + "num_adults": household["is_adult"], + "num_children": household["is_child"], + "private_pension_income": household["private_pension_income"], + "employment_income": household["employment_income"], + "self_employment_income": household["self_employment_income"], + "region": household["region"], + } + ) + + output_df = model.predict(input_df) + # Clip to [0, 1] as it's a probability + household["has_fuel_consumption"] = output_df[ + "has_fuel_consumption" + ].values.clip(0, 1) + + return household + + def generate_lcfs_table( lcfs_person: pd.DataFrame, lcfs_household: pd.DataFrame ): + """ + Generate LCFS training table for consumption imputation. + + Processes raw LCFS data and imputes has_fuel_consumption from WAS + vehicle ownership patterns to improve fuel spending predictions. 
+ """ person = lcfs_person.rename(columns=PERSON_LCF_RENAMES) household = lcfs_household.rename(columns=HOUSEHOLD_LCF_RENAMES) household["region"] = household["region"].map(REGIONS) @@ -102,6 +242,11 @@ def generate_lcfs_table( person[variable].groupby(person.case).sum()[household.case] * 52 ) household.household_weight *= 1_000 + + # Impute has_fuel_consumption from WAS vehicle ownership model + # This bridges WAS (has vehicles) to LCFS (has fuel spending) + household = impute_has_fuel_to_lcfs(household) + return household[ PREDICTOR_VARIABLES + IMPUTATIONS + ["household_weight"] ].dropna() @@ -163,14 +308,36 @@ def create_consumption_model(overwrite_existing: bool = False): def impute_consumption(dataset: UKSingleYearDataset) -> UKSingleYearDataset: - # Impute wealth, assuming same time period as trained data + """ + Impute consumption variables using LCFS-trained model. + + Requires num_vehicles to be present in the dataset (from wealth imputation) + to compute has_fuel_consumption. + """ dataset = dataset.copy() - model = create_consumption_model() + # First, compute has_fuel_consumption from num_vehicles + # This uses the same logic as the WAS training data: + # - Vehicle owners have 90% chance of fuel consumption (ICE vehicles) + # - Non-owners have 0% chance sim = Microsimulation(dataset=dataset) + num_vehicles = sim.calculate("num_vehicles", map_to="household").values + + np.random.seed(42) # Match training data randomness + has_vehicle = num_vehicles > 0 + is_ice = np.random.random(len(num_vehicles)) < NTS_2024_ICE_VEHICLE_SHARE + has_fuel_consumption = (has_vehicle & is_ice).astype(float) + dataset.household["has_fuel_consumption"] = has_fuel_consumption + + # Now run the consumption model with has_fuel_consumption as predictor + model = create_consumption_model() predictors = model.input_columns - input_df = sim.calculate_dataframe(predictors, map_to="household") + input_df = sim.calculate_dataframe( + [p for p in predictors if p != 
"has_fuel_consumption"], + map_to="household", + ) + input_df["has_fuel_consumption"] = has_fuel_consumption output_df = model.predict(input_df) diff --git a/uv.lock b/uv.lock index 114aea48..0cb108e2 100644 --- a/uv.lock +++ b/uv.lock @@ -1409,7 +1409,7 @@ wheels = [ [[package]] name = "policyengine-uk-data" -version = "1.24.2" +version = "1.27.0" source = { editable = "." } dependencies = [ { name = "black" },