diff --git a/Makefile b/Makefile index b516274..9396a93 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: install dev format lint test integration-test clean seed up down logs start-supabase stop-supabase reset rebuild create-state-bucket deploy-local init db-reset-prod modal-deploy modal-serve docs +.PHONY: install dev format lint test integration-test clean seed seed-full up down logs start-supabase stop-supabase rebuild create-state-bucket deploy-local init db-reset-local db-reseed-local db-reset-prod db-reseed-prod modal-deploy modal-serve docs # AWS Configuration AWS_REGION ?= us-east-1 @@ -25,8 +25,8 @@ integration-test: @supabase start || true @echo "2. Initialising database..." @echo "yes" | uv run python scripts/init.py - @echo "3. Running seed script..." - @uv run python scripts/seed.py + @echo "3. Running seed script (lite mode)..." + @uv run python scripts/seed.py --lite @echo "4. Running integration tests..." @pytest tests/test_integration.py -v --tb=short @echo "✓ Integration tests complete!" @@ -40,9 +40,18 @@ clean: find . -type f -name "*.pyc" -delete find . -type d -name "*.egg-info" -exec rm -rf {} + 2>/dev/null || true -reset: - @echo "Resetting Supabase database..." - supabase db reset +db-reset-local: + @echo "Resetting and reseeding LOCAL database..." + @echo "1. Initialising database (drops and recreates tables)..." + @echo "yes" | uv run python scripts/init.py + @echo "2. Seeding data (lite mode)..." + @uv run python scripts/seed.py --lite + @echo "✓ Local database reset and seeded!" + +db-reseed-local: + @echo "Reseeding LOCAL database (lite mode, keeps existing tables)..." + @uv run python scripts/seed.py --lite + @echo "✓ Local database reseeded!" rebuild: @echo "Rebuilding Docker containers..." @@ -52,7 +61,11 @@ rebuild: @echo "✓ Rebuild complete!" seed: - @echo "Seeding database with UK and US models..." + @echo "Seeding database with UK and US models (lite mode)..." 
+ uv run python scripts/seed.py --lite + +seed-full: + @echo "Seeding database with UK and US models (full)..." uv run python scripts/seed.py start-supabase: @@ -113,6 +126,22 @@ db-reset-prod: exit 1; \ fi +db-reseed-prod: + @echo "⚠️ WARNING: This will reseed the PRODUCTION database ⚠️" + @echo "This will add/update models, parameters, and datasets." + @echo "Existing data will be preserved where possible." + @echo "" + @read -p "Are you sure you want to continue? Type 'yes' to confirm: " -r CONFIRM; \ + echo; \ + if [ "$$CONFIRM" = "yes" ]; then \ + echo "Reseeding production database..."; \ + set -a && . .env.prod && set +a && \ + uv run python scripts/seed.py; \ + else \ + echo "Aborted."; \ + exit 1; \ + fi + modal-deploy: @echo "Deploying Modal functions..." @set -a && . .env.prod && set +a && \ diff --git a/docs/AGENT_TESTING.md b/docs/AGENT_TESTING.md new file mode 100644 index 0000000..cccb37c --- /dev/null +++ b/docs/AGENT_TESTING.md @@ -0,0 +1,165 @@ +# Agent testing and optimisation + +This document tracks ongoing work to test and improve the PolicyEngine agent's ability to answer policy questions efficiently. + +## Goal + +Minimise the number of turns the agent needs to answer policy questions by improving API metadata, documentation, and structure - not by hacking for specific test cases. + +## Test categories + +We want comprehensive coverage across: +- **Country**: UK and US +- **Scope**: Household (single family) and Economy (population-wide) +- **Complexity**: Simple (single variable lookup) to Complex (multi-step reforms) + +## Example questions to test + +### UK Household (simple) +- "What is my income tax if I earn £50,000?" +- "How much child benefit would a family with 2 children receive?" + +### UK Household (complex) +- "Compare my net income under current law vs if the basic rate was 25%" +- "What's the marginal tax rate for someone earning £100,000?" + +### UK Economy (simple) +- "What's the total cost of child benefit?" 
+- "How many people pay higher rate tax?" + +### UK Economy (complex) +- "What would be the budgetary impact of raising the personal allowance to £15,000?" +- "How would a £500 UBI affect poverty rates?" + +### US Household (simple) +- "What is my federal income tax if I earn $75,000?" +- "How much SNAP would a family of 4 with $30,000 income receive?" + +### US Household (complex) +- "Compare my benefits under current law vs doubling the EITC" +- "What's my marginal tax rate including state taxes in California?" + +### US Economy (simple) +- "What's the total cost of SNAP?" +- "How many households receive the EITC?" + +### US Economy (complex) +- "What would be the budgetary impact of expanding the Child Tax Credit to $3,600?" +- "How would eliminating the SALT cap affect different income deciles?" + +## Current agent architecture + +The agent uses Claude Code in a Modal sandbox with: +- System prompt containing API documentation (see `src/policyengine_api/prompts/`) +- Direct HTTP calls via curl to the PolicyEngine API +- No MCP (it was causing issues in Modal containers) + +## Optimisation strategies + +1. **Improve system prompt** - Make API usage clearer, provide more examples +2. **Add API response examples** - Show what successful responses look like +3. **Parameter documentation** - Ensure all parameters are well-documented with valid values +4. **Error messages** - Make error messages actionable so agent can self-correct +5. **Endpoint discoverability** - Help agent find the right endpoint quickly + +## Test file location + +Tests are in `tests/test_agent_policy_questions.py` (integration tests requiring Modal). + +## How to continue this work + +1. Run existing tests: `pytest tests/test_agent_policy_questions.py -v -s` +2. Check agent logs in Logfire for turn counts and errors +3. Identify common failure patterns +4. Improve prompts/metadata to address failures +5. 
+ Add new test cases as coverage expands + +## Observed issues + +### Issue 1: Parameter search doesn't filter by country (9 turns for personal allowance) + +**Problem**: When searching for "personal allowance", the agent gets US results (Illinois AABD) mixed with UK results. It took 9 turns to find the UK personal allowance. + +**Agent's search attempts** (only the last succeeded): +1. "personal allowance" → Illinois AABD (US) +2. "income tax personal allowance" → empty +3. "income_tax" → US CBO parameters +4. "basic rate" → UK CGT (closer!) +5. "allowance" → California SSI (US) +6. "hmrc income_tax allowances personal" → empty +7. "hmrc.income_tax.allowances" → found it! + +**Solution implemented**: +- Added `tax_benefit_model_name` filter to `/parameters/` endpoint +- Updated system prompt to instruct the agent to use the country filter + +**NOT acceptable solutions** (test hacking): +- Adding specific parameter name examples to system prompt +- Telling the agent exactly what to search for + +### Issue 2: Duplicate parameters in database + +**Problem**: The same parameter name exists with multiple IDs. One has values, one doesn't. The agent picks the wrong one first. + +**Example**: `gov.hmrc.income_tax.allowances.personal_allowance.amount` has two entries with different IDs. + +**Solution implemented**: Deduplicate parameters by name in seed script (`seen_names` set). + +### Issue 6: Case-sensitive search + +**Problem**: Search for "personal allowance" didn't find "Personal allowance" (capital P). + +**Solution implemented**: Changed search to use `ILIKE` instead of `contains` for case-insensitive matching. + +### Issue 7: Model name mismatch + +**Problem**: System prompt said `policyengine_uk` but database has `policyengine-uk` (hyphen vs underscore). + +**Solution implemented**: Updated system prompt and API docstrings to use correct model names with hyphens. + +### Issue 3: Variables endpoint lacks search + +**Problem**: `/variables/` had no search or country filter. The agent can't discover variable names. 
+ +**Solution implemented**: Added `search` and `tax_benefit_model_name` filters to `/variables/`. + +### Issue 4: Datasets endpoint lacks country filter + +**Problem**: `/datasets/` returned all datasets, mixing UK and US. + +**Solution implemented**: Added `tax_benefit_model_name` filter to `/datasets/`. + +### Issue 5: Parameter values lack "current" filter + +**Problem**: Agent had to parse through all historical values to find the current one. + +**Solution implemented**: Added `current=true` filter to `/parameter-values/` endpoint. + +## API improvements summary + +| Endpoint | Improvement | +|----------|-------------| +| `/parameters/` | Added `tax_benefit_model_name` filter, case-insensitive search | +| `/variables/` | Added `search` and `tax_benefit_model_name` filters, case-insensitive search | +| `/datasets/` | Added `tax_benefit_model_name` filter | +| `/parameter-values/` | Added `current` filter | +| Seed script | Deduplicate parameters by name | +| System prompt | Fixed model names (hyphen not underscore) | + +## Measurements + +| Question type | Baseline | After improvements | Target | +|---------------|----------|-------------------|--------| +| Parameter lookup (UK personal allowance) | 9-10 turns | **3 turns** | 3-4 | +| Household calculation (UK £50k income) | 6 turns | - | 5-6 | + +## Progress log + +- 2024-12-30: Initial setup, created test framework and first batch of questions +- 2024-12-30: Tested personal allowance lookup - 9-10 turns (target: 3-4). Root cause: no country filter on parameter search +- 2024-12-30: Added `tax_benefit_model_name` filter to `/parameters/`, `/variables/`, `/datasets/` +- 2024-12-30: Tested household calc - 6 turns (acceptable). 
Async polling is the overhead +- 2024-12-30: Discovered duplicate parameters in DB causing extra turns +- 2024-12-30: Fixed model name mismatch (policyengine-uk with hyphen, not underscore) +- 2024-12-30: Added case-insensitive search using ILIKE +- 2024-12-30: Tested personal allowance lookup - **3 turns** (target met!) diff --git a/scripts/seed.py b/scripts/seed.py index 6974ed2..b83c9db 100644 --- a/scripts/seed.py +++ b/scripts/seed.py @@ -1,5 +1,6 @@ """Seed database with UK and US models, variables, parameters, datasets.""" +import argparse import json import logging import math @@ -101,7 +102,7 @@ def bulk_insert(session, table: str, columns: list[str], rows: list[dict]): session.commit() -def seed_model(model_version, session) -> TaxBenefitModelVersion: +def seed_model(model_version, session, lite: bool = False) -> TaxBenefitModelVersion: """Seed a tax-benefit model with its variables and parameters.""" with logfire.span( @@ -205,12 +206,27 @@ def seed_model(model_version, session) -> TaxBenefitModelVersion: f" [green]✓[/green] Added {len(model_version.variables)} variables" ) - # Add parameters (only user-facing ones: those with labels or gov.* params) - parameters_to_add = [p for p in model_version.parameters if p.label is not None] - console.print( - f" Filtered to {len(parameters_to_add)} user-facing parameters " - f"(from {len(model_version.parameters)} total)" - ) + # Add parameters (only user-facing ones: those with labels) + # Deduplicate by name - keep first occurrence + # In lite mode, exclude US state parameters (gov.states.*) + seen_names = set() + parameters_to_add = [] + skipped_state_params = 0 + for p in model_version.parameters: + if p.label is None or p.name in seen_names: + continue + # In lite mode, skip state-level parameters for faster seeding + if lite and p.name.startswith("gov.states."): + skipped_state_params += 1 + continue + parameters_to_add.append(p) + seen_names.add(p.name) + + filter_msg = f" Filtered to 
{len(parameters_to_add)} user-facing parameters" + filter_msg += f" (from {len(model_version.parameters)} total, deduplicated by name)" + if lite and skipped_state_params > 0: + filter_msg += f", skipped {skipped_state_params} state params (lite mode)" + console.print(filter_msg) with logfire.span("add_parameters", count=len(parameters_to_add)): # Build list of parameter dicts for bulk insert @@ -574,16 +590,25 @@ def seed_example_policies(session): def main(): """Main seed function.""" + parser = argparse.ArgumentParser(description="Seed PolicyEngine database") + parser.add_argument( + "--lite", + action="store_true", + help="Lite mode: skip US state parameters for faster local seeding", + ) + args = parser.parse_args() + with logfire.span("database_seeding"): - console.print("[bold green]PolicyEngine database seeding[/bold green]\n") + mode_str = " (lite mode)" if args.lite else "" + console.print(f"[bold green]PolicyEngine database seeding{mode_str}[/bold green]\n") with next(get_quiet_session()) as session: # Seed UK model - uk_version = seed_model(uk_latest, session) + uk_version = seed_model(uk_latest, session, lite=args.lite) console.print(f"[green]✓[/green] UK model seeded: {uk_version.id}\n") # Seed US model - us_version = seed_model(us_latest, session) + us_version = seed_model(us_latest, session, lite=args.lite) console.print(f"[green]✓[/green] US model seeded: {us_version.id}\n") # Seed datasets diff --git a/src/policyengine_api/agent_sandbox.py b/src/policyengine_api/agent_sandbox.py index 6408093..6488a32 100644 --- a/src/policyengine_api/agent_sandbox.py +++ b/src/policyengine_api/agent_sandbox.py @@ -21,7 +21,7 @@ You have access to the full PolicyEngine API. Key workflows: 1. **Household calculations**: POST to /household/calculate with people array, then poll GET /household/calculate/{job_id} -2. **Parameter lookup**: GET /parameters/ with search query, then GET /parameter-values/ with parameter_id +2. 
**Parameter lookup**: GET /parameters/ with search query and tax_benefit_model_name, then GET /parameter-values/ with parameter_id 3. **Economic impact**: - GET /parameters/ to find parameter_id - POST /policies/ to create reform with parameter_values @@ -29,6 +29,10 @@ - POST /analysis/economic-impact with policy_id and dataset_id - Poll GET /analysis/economic-impact/{report_id} until completed +When searching for parameters, use tax_benefit_model_name to filter by country: +- "policyengine-uk" for UK parameters +- "policyengine-us" for US parameters + When answering questions: 1. Use the API tools to get accurate, current data 2. Show your calculations clearly diff --git a/src/policyengine_api/api/datasets.py b/src/policyengine_api/api/datasets.py index 12481f0..82540b7 100644 --- a/src/policyengine_api/api/datasets.py +++ b/src/policyengine_api/api/datasets.py @@ -11,24 +11,35 @@ from fastapi import APIRouter, Depends, HTTPException from sqlmodel import Session, select -from policyengine_api.models import Dataset, DatasetRead +from policyengine_api.models import Dataset, DatasetRead, TaxBenefitModel from policyengine_api.services.database import get_session router = APIRouter(prefix="/datasets", tags=["datasets"]) @router.get("/", response_model=List[DatasetRead]) -def list_datasets(session: Session = Depends(get_session)): - """List all available datasets. +def list_datasets( + tax_benefit_model_name: str | None = None, + session: Session = Depends(get_session), +): + """List available datasets. Returns datasets that can be used with the /analysis/economic-impact endpoint. Each dataset represents population microdata for a specific country and year. - USAGE: For UK analysis, look for datasets with names containing "uk" or "frs". - For US analysis, look for datasets with names containing "us" or "cps". - Use the dataset's id when calling /analysis/economic-impact. + Args: + tax_benefit_model_name: Filter by country model. + Use "policyengine-uk" for UK datasets. 
+ Use "policyengine-us" for US datasets. """ - datasets = session.exec(select(Dataset)).all() + query = select(Dataset) + + if tax_benefit_model_name: + query = query.join(TaxBenefitModel).where( + TaxBenefitModel.name == tax_benefit_model_name + ) + + datasets = session.exec(query).all() return datasets diff --git a/src/policyengine_api/api/parameter_values.py b/src/policyengine_api/api/parameter_values.py index c16367f..4668ab8 100644 --- a/src/policyengine_api/api/parameter_values.py +++ b/src/policyengine_api/api/parameter_values.py @@ -5,12 +5,12 @@ when a policy modifies a parameter. """ +from datetime import datetime, timezone from typing import List from uuid import UUID from fastapi import APIRouter, Depends, HTTPException -from fastapi_cache.decorator import cache -from sqlmodel import Session, select +from sqlmodel import Session, or_, select from policyengine_api.models import ParameterValue, ParameterValueRead from policyengine_api.services.database import get_session @@ -19,10 +19,10 @@ @router.get("/", response_model=List[ParameterValueRead]) -@cache(expire=3600) # Cache for 1 hour def list_parameter_values( parameter_id: UUID | None = None, policy_id: UUID | None = None, + current: bool = False, skip: int = 0, limit: int = 100, session: Session = Depends(get_session), @@ -32,8 +32,11 @@ def list_parameter_values( Parameter values store the numeric/string values for policy parameters at specific time periods (start_date to end_date). - Use `parameter_id` to filter by a specific parameter. - Use `policy_id` to filter by a specific policy reform. + Args: + parameter_id: Filter by a specific parameter. + policy_id: Filter by a specific policy reform. + current: If true, only return values that are currently in effect + (start_date <= now and (end_date is null or end_date > now)). 
""" query = select(ParameterValue) @@ -43,8 +46,18 @@ def list_parameter_values( if policy_id: query = query.where(ParameterValue.policy_id == policy_id) - # Order by start_date ascending so historical/current values come first - query = query.order_by(ParameterValue.start_date.asc()) + if current: + now = datetime.now(timezone.utc) + query = query.where( + ParameterValue.start_date <= now, + or_( + ParameterValue.end_date.is_(None), + ParameterValue.end_date > now, + ), + ) + + # Order by start_date descending so most recent values come first + query = query.order_by(ParameterValue.start_date.desc()) parameter_values = session.exec(query.offset(skip).limit(limit)).all() return parameter_values diff --git a/src/policyengine_api/api/parameters.py b/src/policyengine_api/api/parameters.py index 3bb34bc..db029e5 100644 --- a/src/policyengine_api/api/parameters.py +++ b/src/policyengine_api/api/parameters.py @@ -11,7 +11,12 @@ from fastapi import APIRouter, Depends, HTTPException from sqlmodel import Session, select -from policyengine_api.models import Parameter, ParameterRead +from policyengine_api.models import ( + Parameter, + ParameterRead, + TaxBenefitModel, + TaxBenefitModelVersion, +) from policyengine_api.services.database import get_session router = APIRouter(prefix="/parameters", tags=["parameters"]) @@ -22,6 +27,7 @@ def list_parameters( skip: int = 0, limit: int = 100, search: str | None = None, + tax_benefit_model_name: str | None = None, session: Session = Depends(get_session), ): """List available parameters with pagination and search. @@ -29,16 +35,29 @@ def list_parameters( Parameters are policy levers (e.g. tax rates, thresholds, benefit amounts) that can be modified in reforms. Use parameter names when creating policies. - Use the `search` parameter to filter by parameter name, label, or description. - For example: search="basic_rate" or search="income tax" + Args: + search: Filter by parameter name, label, or description. 
+ tax_benefit_model_name: Filter by country model. + Use "policyengine-uk" for UK parameters. + Use "policyengine-us" for US parameters. """ query = select(Parameter) + # Filter by tax benefit model name (country) + if tax_benefit_model_name: + query = ( + query.join(TaxBenefitModelVersion) + .join(TaxBenefitModel) + .where(TaxBenefitModel.name == tax_benefit_model_name) + ) + if search: + # Case-insensitive search using ILIKE + search_pattern = f"%{search}%" search_filter = ( - Parameter.name.contains(search) - | Parameter.label.contains(search) - | Parameter.description.contains(search) + Parameter.name.ilike(search_pattern) + | Parameter.label.ilike(search_pattern) + | Parameter.description.ilike(search_pattern) ) query = query.where(search_filter) diff --git a/src/policyengine_api/api/variables.py b/src/policyengine_api/api/variables.py index 8da5cf4..a24df44 100644 --- a/src/policyengine_api/api/variables.py +++ b/src/policyengine_api/api/variables.py @@ -9,28 +9,61 @@ from uuid import UUID from fastapi import APIRouter, Depends, HTTPException -from fastapi_cache.decorator import cache from sqlmodel import Session, select -from policyengine_api.models import Variable, VariableRead +from policyengine_api.models import ( + TaxBenefitModel, + TaxBenefitModelVersion, + Variable, + VariableRead, +) from policyengine_api.services.database import get_session router = APIRouter(prefix="/variables", tags=["variables"]) @router.get("/", response_model=List[VariableRead]) -@cache(expire=3600) # Cache for 1 hour def list_variables( - skip: int = 0, limit: int = 100, session: Session = Depends(get_session) + skip: int = 0, + limit: int = 100, + search: str | None = None, + tax_benefit_model_name: str | None = None, + session: Session = Depends(get_session), ): - """List available variables with pagination. + """List available variables with pagination and search. Variables are inputs (e.g. employment_income, age) and outputs (e.g. 
income_tax, household_net_income) of tax-benefit calculations. Use variable names in household calculation requests. + + Args: + search: Filter by variable name, label, or description. + tax_benefit_model_name: Filter by country model. + Use "policyengine-uk" for UK variables. + Use "policyengine-us" for US variables. """ + query = select(Variable) + + # Filter by tax benefit model name (country) + if tax_benefit_model_name: + query = ( + query.join(TaxBenefitModelVersion) + .join(TaxBenefitModel) + .where(TaxBenefitModel.name == tax_benefit_model_name) + ) + + if search: + # Case-insensitive search using ILIKE + search_pattern = f"%{search}%" + search_filter = ( + Variable.name.ilike(search_pattern) + | Variable.label.ilike(search_pattern) + | Variable.description.ilike(search_pattern) + ) + query = query.where(search_filter) + variables = session.exec( - select(Variable).order_by(Variable.name).offset(skip).limit(limit) + query.order_by(Variable.name).offset(skip).limit(limit) ).all() return variables diff --git a/supabase/.temp/cli-latest b/supabase/.temp/cli-latest index 1b2faa2..8c68db7 100644 --- a/supabase/.temp/cli-latest +++ b/supabase/.temp/cli-latest @@ -1 +1 @@ -v2.65.5 \ No newline at end of file +v2.67.1 \ No newline at end of file diff --git a/tests/test_agent_policy_questions.py b/tests/test_agent_policy_questions.py new file mode 100644 index 0000000..1550f89 --- /dev/null +++ b/tests/test_agent_policy_questions.py @@ -0,0 +1,221 @@ +"""Integration tests for agent policy questions. + +These tests run real agent queries and measure turn counts. +Run with: pytest tests/test_agent_policy_questions.py -v -s + +The goal is to track agent performance and identify opportunities +to improve API metadata/documentation to reduce turns needed. 
+""" + +import pytest + +pytestmark = pytest.mark.integration + +from policyengine_api.agent_sandbox import _run_agent_impl + +import os + +# Use local API by default, override with POLICYENGINE_API_URL env var +API_BASE = os.environ.get("POLICYENGINE_API_URL", "http://localhost:8000") + + +class TestParameterLookup: + """Parameter lookup questions - should complete in 2-4 turns.""" + + def test_uk_personal_allowance(self): + """UK personal allowance lookup.""" + result = _run_agent_impl( + "What is the current UK personal allowance?", + api_base_url=API_BASE, + max_turns=10, + ) + assert result["status"] == "completed" + assert result["result"] is not None + print(f"\nTurns: {result['turns']}") + print(f"Result: {result['result'][:500]}") + + def test_uk_higher_rate_threshold(self): + """UK higher rate threshold lookup.""" + result = _run_agent_impl( + "At what income level does the UK higher rate (40%) tax band start?", + api_base_url=API_BASE, + max_turns=10, + ) + assert result["status"] == "completed" + assert result["result"] is not None + print(f"\nTurns: {result['turns']}") + print(f"Result: {result['result'][:500]}") + + def test_us_standard_deduction(self): + """US standard deduction lookup.""" + result = _run_agent_impl( + "What is the US federal standard deduction for a single filer?", + api_base_url=API_BASE, + max_turns=10, + ) + assert result["status"] == "completed" + assert result["result"] is not None + print(f"\nTurns: {result['turns']}") + print(f"Result: {result['result'][:500]}") + + +class TestUKHouseholdSimple: + """Simple UK household questions - should complete in 3-5 turns.""" + + def test_income_tax_calculation(self): + """Basic income tax calculation.""" + result = _run_agent_impl( + "What is my income tax if I earn £50,000 per year in the UK?", + api_base_url=API_BASE, + max_turns=10, + ) + assert result["status"] == "completed" + assert result["result"] is not None + assert "£" in result["result"] or "GBP" in result["result"] + 
print(f"\nTurns: {result['turns']}") + print(f"Result: {result['result'][:500]}") + + def test_child_benefit_lookup(self): + """Child benefit for a family.""" + result = _run_agent_impl( + "How much child benefit would a UK family with 2 children receive per week?", + api_base_url=API_BASE, + max_turns=10, + ) + assert result["status"] == "completed" + assert result["result"] is not None + print(f"\nTurns: {result['turns']}") + print(f"Result: {result['result'][:500]}") + + +class TestUKHouseholdComplex: + """Complex UK household questions - may need 5-10 turns.""" + + def test_marginal_rate_at_100k(self): + """Marginal tax rate calculation at £100k (60% trap).""" + result = _run_agent_impl( + "What is the effective marginal tax rate for someone earning £100,000 in the UK? " + "Include the personal allowance taper.", + api_base_url=API_BASE, + max_turns=15, + ) + assert result["status"] == "completed" + assert result["result"] is not None + print(f"\nTurns: {result['turns']}") + print(f"Result: {result['result'][:500]}") + + def test_reform_comparison(self): + """Compare baseline vs reform for a household.""" + result = _run_agent_impl( + "Compare the net income for someone earning £40,000 under current UK tax law " + "versus if the basic rate of income tax was 25% instead of 20%.", + api_base_url=API_BASE, + max_turns=15, + ) + assert result["status"] == "completed" + assert result["result"] is not None + print(f"\nTurns: {result['turns']}") + print(f"Result: {result['result'][:500]}") + + +class TestUSHouseholdSimple: + """Simple US household questions - should complete in 3-5 turns.""" + + def test_federal_income_tax(self): + """Basic federal income tax calculation.""" + result = _run_agent_impl( + "What is my federal income tax if I earn $75,000 per year in the US?", + api_base_url=API_BASE, + max_turns=10, + ) + assert result["status"] == "completed" + assert result["result"] is not None + assert "$" in result["result"] or "USD" in result["result"] + 
print(f"\nTurns: {result['turns']}") + print(f"Result: {result['result'][:500]}") + + def test_snap_eligibility(self): + """SNAP benefit calculation.""" + result = _run_agent_impl( + "How much SNAP (food stamps) would a family of 4 with $30,000 annual income " + "receive in the US?", + api_base_url=API_BASE, + max_turns=10, + ) + assert result["status"] == "completed" + assert result["result"] is not None + print(f"\nTurns: {result['turns']}") + print(f"Result: {result['result'][:500]}") + + +class TestUSHouseholdComplex: + """Complex US household questions - may need 5-10 turns.""" + + def test_eitc_calculation(self): + """EITC with children calculation.""" + result = _run_agent_impl( + "Calculate the Earned Income Tax Credit for a single parent with 2 children " + "earning $25,000 per year in the US.", + api_base_url=API_BASE, + max_turns=15, + ) + assert result["status"] == "completed" + assert result["result"] is not None + print(f"\nTurns: {result['turns']}") + print(f"Result: {result['result'][:500]}") + + +class TestEconomyWide: + """Economy-wide analysis questions - budgetary impacts, distributional analysis.""" + + def test_uk_policy_budgetary_impact(self): + """UK policy reform budgetary impact.""" + result = _run_agent_impl( + "What would be the budgetary impact of raising the UK personal allowance to £15,000?", + api_base_url=API_BASE, + max_turns=20, + ) + assert result["status"] == "completed" + assert result["result"] is not None + print(f"\nTurns: {result['turns']}") + print(f"Result: {result['result'][:500]}") + + def test_us_policy_winners_losers(self): + """US policy reform winners and losers.""" + result = _run_agent_impl( + "If the US doubled the Child Tax Credit, which income deciles would benefit most?", + api_base_url=API_BASE, + max_turns=20, + ) + assert result["status"] == "completed" + assert result["result"] is not None + print(f"\nTurns: {result['turns']}") + print(f"Result: {result['result'][:500]}") + + +class TestTurnCounting: + 
"""Tests specifically to measure turn efficiency.""" + + @pytest.mark.parametrize( + "question,max_expected_turns", + [ + ("What is the UK personal allowance?", 5), + ("What is the US standard deduction?", 5), + ("Calculate income tax for £30,000 UK salary", 6), + ("Calculate federal income tax for $50,000 US salary", 6), + ], + ) + def test_turn_efficiency(self, question, max_expected_turns): + """Verify agent completes within expected turn count.""" + result = _run_agent_impl( + question, + api_base_url=API_BASE, + max_turns=max_expected_turns + 5, + ) + assert result["status"] == "completed" + print(f"\nQuestion: {question}") + print(f"Turns: {result['turns']} (max expected: {max_expected_turns})") + print(f"Result: {result['result'][:300]}") + + if result["turns"] > max_expected_turns: + print(f"WARNING: Took {result['turns']} turns, expected <= {max_expected_turns}")