PolicyEngine · nikhilwoodruff · Dec 30, 2025 · Dec 30, 2025 · Dec 30, 2025 · Dec 30, 2025
diff --git a/Makefile b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: install dev format lint test integration-test clean seed up down logs start-supabase stop-supabase reset rebuild create-state-bucket deploy-local init db-reset-prod modal-deploy modal-serve docs
+.PHONY: install dev format lint test integration-test clean seed seed-full up down logs start-supabase stop-supabase rebuild create-state-bucket deploy-local init db-reset-local db-reseed-local db-reset-prod db-reseed-prod modal-deploy modal-serve docs
 
 # AWS Configuration
 AWS_REGION ?= us-east-1
@@ -25,8 +25,8 @@ integration-test:
 	@supabase start || true
 	@echo "2. Initialising database..."
 	@echo "yes" | uv run python scripts/init.py
-	@echo "3. Running seed script..."
-	@uv run python scripts/seed.py
+	@echo "3. Running seed script (lite mode)..."
+	@uv run python scripts/seed.py --lite
 	@echo "4. Running integration tests..."
 	@pytest tests/test_integration.py -v --tb=short
 	@echo "✓ Integration tests complete!"
@@ -40,9 +40,18 @@ clean:
 	find . -type f -name "*.pyc" -delete
 	find . -type d -name "*.egg-info" -exec rm -rf {} + 2>/dev/null || true
 
-reset:
-	@echo "Resetting Supabase database..."
-	supabase db reset
+db-reset-local:
+	@echo "Resetting and reseeding LOCAL database..."
+	@echo "1. Initialising database (drops and recreates tables)..."
+	@echo "yes" | uv run python scripts/init.py
+	@echo "2. Seeding data (lite mode)..."
+	@uv run python scripts/seed.py --lite
+	@echo "✓ Local database reset and seeded!"
+
+db-reseed-local:
+	@echo "Reseeding LOCAL database (lite mode, keeps existing tables)..."
+	@uv run python scripts/seed.py --lite
+	@echo "✓ Local database reseeded!"
 
 rebuild:
 	@echo "Rebuilding Docker containers..."
@@ -52,7 +61,11 @@ rebuild:
 	@echo "✓ Rebuild complete!"
 
 seed:
-	@echo "Seeding database with UK and US models..."
+	@echo "Seeding database with UK and US models (lite mode)..."
+	uv run python scripts/seed.py --lite
+
+seed-full:
+	@echo "Seeding database with UK and US models (full)..."
 	uv run python scripts/seed.py
 
 start-supabase:
@@ -113,6 +126,22 @@ db-reset-prod:
 		exit 1; \
 	fi
 
+# Reseed production after an explicit "yes". Prompts via printf + read because `read -p` is a bashism that fails under a POSIX /bin/sh such as dash.
+db-reseed-prod:
+	@echo "⚠️  WARNING: This will reseed the PRODUCTION database ⚠️"
+	@echo "This will add/update models, parameters, and datasets."
+	@echo "Existing data will be preserved where possible."
+	@echo ""
+	@printf '%s' "Are you sure you want to continue? Type 'yes' to confirm: "; read -r CONFIRM; echo; \
+	if [ "$$CONFIRM" = "yes" ]; then \
+		echo "Reseeding production database..."; \
+		set -a && . .env.prod && set +a && \
+		uv run python scripts/seed.py; \
+	else \
+		echo "Aborted."; \
+		exit 1; \
+	fi
+
 modal-deploy:
 	@echo "Deploying Modal functions..."
 	@set -a && . .env.prod && set +a && \

diff --git a/docs/AGENT_TESTING.md b/docs/AGENT_TESTING.md
@@ -0,0 +1,165 @@
+# Agent testing and optimisation
+
+This document tracks ongoing work to test and improve the PolicyEngine agent's ability to answer policy questions efficiently.
+
+## Goal
+
+Minimise the number of turns the agent needs to answer policy questions by improving API metadata, documentation, and structure - not by hacking for specific test cases.
+
+## Test categories
+
+We want comprehensive coverage across:
+- **Country**: UK and US
+- **Scope**: Household (single family) and Economy (population-wide)
+- **Complexity**: Simple (single variable lookup) to Complex (multi-step reforms)
+
+## Example questions to test
+
+### UK Household (simple)
+- "What is my income tax if I earn £50,000?"
+- "How much child benefit would a family with 2 children receive?"
+
+### UK Household (complex)
+- "Compare my net income under current law vs if the basic rate was 25%"
+- "What's the marginal tax rate for someone earning £100,000?"
+
+### UK Economy (simple)
+- "What's the total cost of child benefit?"
+- "How many people pay higher rate tax?"
+
+### UK Economy (complex)
+- "What would be the budgetary impact of raising the personal allowance to £15,000?"
+- "How would a £500 UBI affect poverty rates?"
+
+### US Household (simple)
+- "What is my federal income tax if I earn $75,000?"
+- "How much SNAP would a family of 4 with $30,000 income receive?"
+
+### US Household (complex)
+- "Compare my benefits under current law vs doubling the EITC"
+- "What's my marginal tax rate including state taxes in California?"
+
+### US Economy (simple)
+- "What's the total cost of SNAP?"
+- "How many households receive the EITC?"
+
+### US Economy (complex)
+- "What would be the budgetary impact of expanding the Child Tax Credit to $3,600?"
+- "How would eliminating the SALT cap affect different income deciles?"
+
+## Current agent architecture
+
+The agent uses Claude Code in a Modal sandbox with:
+- System prompt containing API documentation (see `src/policyengine_api/prompts/`)
+- Direct HTTP calls via curl to the PolicyEngine API
+- No MCP (it was causing issues in Modal containers)
+
+## Optimisation strategies
+
+1. **Improve system prompt** - Make API usage clearer, provide more examples
+2. **Add API response examples** - Show what successful responses look like
+3. **Parameter documentation** - Ensure all parameters are well-documented with valid values
+4. **Error messages** - Make error messages actionable so the agent can self-correct
+5. **Endpoint discoverability** - Help the agent find the right endpoint quickly
+
+## Test file location
+
+Tests are in `tests/test_agent_policy_questions.py` (integration tests requiring Modal).
+
+## How to continue this work
+
+1. Run existing tests: `pytest tests/test_agent_policy_questions.py -v -s`
+2. Check agent logs in Logfire for turn counts and errors
+3. Identify common failure patterns
+4. Improve prompts/metadata to address failures
+5. Add new test cases as coverage expands
+
+## Observed issues
+
+### Issue 1: Parameter search doesn't filter by country (9 turns for personal allowance)
+
+**Problem**: When searching for "personal allowance", the agent gets US results (Illinois AABD) mixed with UK results. It took 9 turns to find the UK personal allowance.
+
+**Agent's search attempts** (only the last one succeeded):
+1. "personal allowance" → Illinois AABD (US)
+2. "income tax personal allowance" → empty
+3. "income_tax" → US CBO parameters
+4. "basic rate" → UK CGT (closer!)
+5. "allowance" → California SSI (US)
+6. "hmrc income_tax allowances personal" → empty
+7. "hmrc.income_tax.allowances" → found it!
+
+**Solution implemented**:
+- Added `tax_benefit_model_name` filter to `/parameters/` endpoint
+- Updated system prompt to instruct agent to use country filter
+
+**NOT acceptable solutions** (test hacking):
+- Adding specific parameter name examples to system prompt
+- Telling agent exactly what to search for
+
+### Issue 2: Duplicate parameters in database
+
+**Problem**: The same parameter name exists with multiple IDs. One has values, one doesn't. The agent picks the wrong one first.
+
+**Example**: `gov.hmrc.income_tax.allowances.personal_allowance.amount` has two entries with different IDs.
+
+**Solution implemented**: Deduplicate parameters by name in seed script (`seen_names` set).
+
+### Issue 6: Case-sensitive search
+
+**Problem**: Search for "personal allowance" didn't find "Personal allowance" (capital P).
+
+**Solution implemented**: Changed search to use `ILIKE` instead of `contains` for case-insensitive matching.
+
+### Issue 7: Model name mismatch
+
+**Problem**: System prompt said `policyengine_uk` but database has `policyengine-uk` (hyphen vs underscore).
+
+**Solution implemented**: Updated system prompt and API docstrings to use correct model names with hyphens.
+
+### Issue 3: Variables endpoint lacks search
+
+**Problem**: `/variables/` had no search or country filter, so the agent couldn't discover variable names.
+
+**Solution implemented**: Added `search` and `tax_benefit_model_name` filters to `/variables/`.
+
+### Issue 4: Datasets endpoint lacks country filter
+
+**Problem**: `/datasets/` returned all datasets, mixing UK and US.
+
+**Solution implemented**: Added `tax_benefit_model_name` filter to `/datasets/`.
+
+### Issue 5: Parameter values lack "current" filter
+
+**Problem**: The agent had to parse through all historical values to find the current one.
+
+**Solution implemented**: Added `current=true` filter to `/parameter-values/` endpoint.
+
+## API improvements summary
+
+| Endpoint | Improvement |
+|----------|-------------|
+| `/parameters/` | Added `tax_benefit_model_name` filter, case-insensitive search |
+| `/variables/` | Added `search` and `tax_benefit_model_name` filters, case-insensitive search |
+| `/datasets/` | Added `tax_benefit_model_name` filter |
+| `/parameter-values/` | Added `current` filter |
+| Seed script | Deduplicate parameters by name |
+| System prompt | Fixed model names (hyphen not underscore) |
+
+## Measurements
+
+| Question type | Baseline | After improvements | Target |
+|---------------|----------|-------------------|--------|
+| Parameter lookup (UK personal allowance) | 9-10 turns | **3 turns** | 3-4 |
+| Household calculation (UK £50k income) | 6 turns | - | 5-6 |
+
+## Progress log
+
+- 2025-12-30: Initial setup, created test framework and first batch of questions
+- 2025-12-30: Tested personal allowance lookup - 9-10 turns (target: 3-4). Root cause: no country filter on parameter search
+- 2025-12-30: Added `tax_benefit_model_name` filter to `/parameters/`, `/variables/`, `/datasets/`
+- 2025-12-30: Tested household calc - 6 turns (acceptable). Async polling is the overhead
+- 2025-12-30: Discovered duplicate parameters in DB causing extra turns
+- 2025-12-30: Fixed model name mismatch (policyengine-uk with hyphen, not underscore)
+- 2025-12-30: Added case-insensitive search using ILIKE
+- 2025-12-30: Tested personal allowance lookup - **3 turns** (target met!)
diff --git a/scripts/seed.py b/scripts/seed.py
@@ -1,5 +1,6 @@
 """Seed database with UK and US models, variables, parameters, datasets."""
 
+import argparse
 import json
 import logging
 import math
@@ -101,7 +102,7 @@ def bulk_insert(session, table: str, columns: list[str], rows: list[dict]):
     session.commit()
 
 
-def seed_model(model_version, session) -> TaxBenefitModelVersion:
+def seed_model(model_version, session, lite: bool = False) -> TaxBenefitModelVersion:
     """Seed a tax-benefit model with its variables and parameters."""
 
     with logfire.span(
@@ -205,12 +206,27 @@ def seed_model(model_version, session) -> TaxBenefitModelVersion:
                 f"  [green]✓[/green] Added {len(model_version.variables)} variables"
             )
 
-        # Add parameters (only user-facing ones: those with labels or gov.* params)
-        parameters_to_add = [p for p in model_version.parameters if p.label is not None]
-        console.print(
-            f"  Filtered to {len(parameters_to_add)} user-facing parameters "
-            f"(from {len(model_version.parameters)} total)"
-        )
+        # Add parameters (only user-facing ones: those with labels)
+        # Deduplicate by name - keep first occurrence
+        # In lite mode, exclude US state parameters (gov.states.*)
+        seen_names = set()
+        parameters_to_add = []
+        skipped_state_params = 0
+        for p in model_version.parameters:
+            if p.label is None or p.name in seen_names:
+                continue
+            # In lite mode, skip state-level parameters for faster seeding
+            if lite and p.name.startswith("gov.states."):
+                skipped_state_params += 1
+                continue
+            parameters_to_add.append(p)
+            seen_names.add(p.name)
+
+        filter_msg = f"  Filtered to {len(parameters_to_add)} user-facing parameters"
+        filter_msg += f" (from {len(model_version.parameters)} total, deduplicated by name)"
+        if lite and skipped_state_params > 0:
+            filter_msg += f", skipped {skipped_state_params} state params (lite mode)"
+        console.print(filter_msg)
 
         with logfire.span("add_parameters", count=len(parameters_to_add)):
             # Build list of parameter dicts for bulk insert
@@ -574,16 +590,25 @@ def seed_example_policies(session):
 
 def main():
     """Main seed function."""
+    parser = argparse.ArgumentParser(description="Seed PolicyEngine database")
+    parser.add_argument(
+        "--lite",
+        action="store_true",
+        help="Lite mode: skip US state parameters for faster local seeding",
+    )
+    args = parser.parse_args()
+
     with logfire.span("database_seeding"):
-        console.print("[bold green]PolicyEngine database seeding[/bold green]\n")
+        mode_str = " (lite mode)" if args.lite else ""
+        console.print(f"[bold green]PolicyEngine database seeding{mode_str}[/bold green]\n")
 
         with next(get_quiet_session()) as session:
             # Seed UK model
-            uk_version = seed_model(uk_latest, session)
+            uk_version = seed_model(uk_latest, session, lite=args.lite)
             console.print(f"[green]✓[/green] UK model seeded: {uk_version.id}\n")
 
             # Seed US model
-            us_version = seed_model(us_latest, session)
+            us_version = seed_model(us_latest, session, lite=args.lite)
             console.print(f"[green]✓[/green] US model seeded: {us_version.id}\n")
 
             # Seed datasets

diff --git a/src/policyengine_api/agent_sandbox.py b/src/policyengine_api/agent_sandbox.py
@@ -21,14 +21,18 @@
 You have access to the full PolicyEngine API. Key workflows:
 
 1. **Household calculations**: POST to /household/calculate with people array, then poll GET /household/calculate/{job_id}
-2. **Parameter lookup**: GET /parameters/ with search query, then GET /parameter-values/ with parameter_id
+2. **Parameter lookup**: GET /parameters/ with search query and tax_benefit_model_name, then GET /parameter-values/ with parameter_id
 3. **Economic impact**:
    - GET /parameters/ to find parameter_id
    - POST /policies/ to create reform with parameter_values
    - GET /datasets/ to find dataset_id
    - POST /analysis/economic-impact with policy_id and dataset_id
    - Poll GET /analysis/economic-impact/{report_id} until completed
 
+When searching for parameters, use tax_benefit_model_name to filter by country:
+- "policyengine-uk" for UK parameters
+- "policyengine-us" for US parameters
+
 When answering questions:
 1. Use the API tools to get accurate, current data
 2. Show your calculations clearly

diff --git a/src/policyengine_api/api/datasets.py b/src/policyengine_api/api/datasets.py
@@ -11,24 +11,35 @@
 from fastapi import APIRouter, Depends, HTTPException
 from sqlmodel import Session, select
 
-from policyengine_api.models import Dataset, DatasetRead
+from policyengine_api.models import Dataset, DatasetRead, TaxBenefitModel
 from policyengine_api.services.database import get_session
 
 router = APIRouter(prefix="/datasets", tags=["datasets"])
 
 
 @router.get("/", response_model=List[DatasetRead])
-def list_datasets(session: Session = Depends(get_session)):
-    """List all available datasets.
+def list_datasets(
+    tax_benefit_model_name: str | None = None,
+    session: Session = Depends(get_session),
+):
+    """List available datasets.
 
     Returns datasets that can be used with the /analysis/economic-impact endpoint.
     Each dataset represents population microdata for a specific country and year.
 
-    USAGE: For UK analysis, look for datasets with names containing "uk" or "frs".
-    For US analysis, look for datasets with names containing "us" or "cps".
-    Use the dataset's id when calling /analysis/economic-impact.
+    Args:
+        tax_benefit_model_name: Filter by country model.
+            Use "policyengine-uk" for UK datasets.
+            Use "policyengine-us" for US datasets.
     """
-    datasets = session.exec(select(Dataset)).all()
+    query = select(Dataset)
+
+    if tax_benefit_model_name:
+        query = query.join(TaxBenefitModel).where(
+            TaxBenefitModel.name == tax_benefit_model_name
+        )
+
+    datasets = session.exec(query).all()
     return datasets