diff --git a/Makefile b/Makefile
index b516274..9396a93 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: install dev format lint test integration-test clean seed up down logs start-supabase stop-supabase reset rebuild create-state-bucket deploy-local init db-reset-prod modal-deploy modal-serve docs
+.PHONY: install dev format lint test integration-test clean seed seed-full up down logs start-supabase stop-supabase rebuild create-state-bucket deploy-local init db-reset-local db-reseed-local db-reset-prod db-reseed-prod modal-deploy modal-serve docs
 
 # AWS Configuration
 AWS_REGION ?= us-east-1
@@ -25,8 +25,8 @@ integration-test:
 	@supabase start || true
 	@echo "2. Initialising database..."
 	@echo "yes" | uv run python scripts/init.py
-	@echo "3. Running seed script..."
-	@uv run python scripts/seed.py
+	@echo "3. Running seed script (lite mode)..."
+	@uv run python scripts/seed.py --lite
 	@echo "4. Running integration tests..."
 	@pytest tests/test_integration.py -v --tb=short
 	@echo "✓ Integration tests complete!"
@@ -40,9 +40,18 @@ clean:
 	find . -type f -name "*.pyc" -delete
 	find . -type d -name "*.egg-info" -exec rm -rf {} + 2>/dev/null || true
 
-reset:
-	@echo "Resetting Supabase database..."
-	supabase db reset
+db-reset-local:
+	@echo "Resetting and reseeding LOCAL database..."
+	@echo "1. Initialising database (drops and recreates tables)..."
+	@echo "yes" | uv run python scripts/init.py
+	@echo "2. Seeding data (lite mode)..."
+	@uv run python scripts/seed.py --lite
+	@echo "✓ Local database reset and seeded!"
+
+db-reseed-local:
+	@echo "Reseeding LOCAL database (lite mode, keeps existing tables)..."
+	@uv run python scripts/seed.py --lite
+	@echo "✓ Local database reseeded!"
 
 rebuild:
 	@echo "Rebuilding Docker containers..."
@@ -52,7 +61,11 @@ rebuild:
 	@echo "✓ Rebuild complete!"
 
 seed:
-	@echo "Seeding database with UK and US models..."
+	@echo "Seeding database with UK and US models (lite mode)..."
+	uv run python scripts/seed.py --lite
+
+seed-full:
+	@echo "Seeding database with UK and US models (full)..."
 	uv run python scripts/seed.py
 
 start-supabase:
@@ -113,6 +126,22 @@ db-reset-prod:
 		exit 1; \
 	fi
 
+db-reseed-prod: ## Reseed PRODUCTION (full seed) after typed confirmation; prompt and env sourcing are POSIX-sh safe
+	@echo "⚠️  WARNING: This will reseed the PRODUCTION database ⚠️"
+	@echo "This will add/update models, parameters, and datasets."
+	@echo "Existing data will be preserved where possible."
+	@echo ""
+	@printf "Are you sure you want to continue? Type 'yes' to confirm: "; read -r CONFIRM; \
+	echo; \
+	if [ "$$CONFIRM" = "yes" ]; then \
+		echo "Reseeding production database..."; \
+		set -a && . ./.env.prod && set +a && \
+		uv run python scripts/seed.py; \
+	else \
+		echo "Aborted."; \
+		exit 1; \
+	fi
+
 modal-deploy:
 	@echo "Deploying Modal functions..."
 	@set -a && . .env.prod && set +a && \
diff --git a/docs/AGENT_TESTING.md b/docs/AGENT_TESTING.md
new file mode 100644
index 0000000..cccb37c
--- /dev/null
+++ b/docs/AGENT_TESTING.md
@@ -0,0 +1,165 @@
+# Agent testing and optimisation
+
+This document tracks ongoing work to test and improve the PolicyEngine agent's ability to answer policy questions efficiently.
+
+## Goal
+
+Minimise the number of turns the agent needs to answer policy questions by improving API metadata, documentation, and structure - not by hacking for specific test cases.
+
+## Test categories
+
+We want comprehensive coverage across:
+- **Country**: UK and US
+- **Scope**: Household (single family) and Economy (population-wide)
+- **Complexity**: Simple (single variable lookup) to Complex (multi-step reforms)
+
+## Example questions to test
+
+### UK Household (simple)
+- "What is my income tax if I earn £50,000?"
+- "How much child benefit would a family with 2 children receive?"
+
+### UK Household (complex)
+- "Compare my net income under current law vs if the basic rate was 25%"
+- "What's the marginal tax rate for someone earning £100,000?"
+
+### UK Economy (simple)
+- "What's the total cost of child benefit?"
+- "How many people pay higher rate tax?"
+
+### UK Economy (complex)
+- "What would be the budgetary impact of raising the personal allowance to £15,000?"
+- "How would a £500 UBI affect poverty rates?"
+
+### US Household (simple)
+- "What is my federal income tax if I earn $75,000?"
+- "How much SNAP would a family of 4 with $30,000 income receive?"
+
+### US Household (complex)
+- "Compare my benefits under current law vs doubling the EITC"
+- "What's my marginal tax rate including state taxes in California?"
+
+### US Economy (simple)
+- "What's the total cost of SNAP?"
+- "How many households receive the EITC?"
+
+### US Economy (complex)
+- "What would be the budgetary impact of expanding the Child Tax Credit to $3,600?"
+- "How would eliminating the SALT cap affect different income deciles?"
+
+## Current agent architecture
+
+The agent uses Claude Code in a Modal sandbox with:
+- System prompt containing API documentation (see `src/policyengine_api/prompts/`)
+- Direct HTTP calls via curl to the PolicyEngine API
+- No MCP (it was causing issues in Modal containers)
+
+## Optimisation strategies
+
+1. **Improve system prompt** - Make API usage clearer, provide more examples
+2. **Add API response examples** - Show what successful responses look like
+3. **Parameter documentation** - Ensure all parameters are well-documented with valid values
+4. **Error messages** - Make error messages actionable so the agent can self-correct
+5. **Endpoint discoverability** - Help the agent find the right endpoint quickly
+
+## Test file location
+
+Tests are in `tests/test_agent_policy_questions.py` (integration tests requiring Modal).
+
+## How to continue this work
+
+1. Run existing tests: `pytest tests/test_agent_policy_questions.py -v -s`
+2. Check agent logs in Logfire for turn counts and errors
+3. Identify common failure patterns
+4. Improve prompts/metadata to address failures
+5. Add new test cases as coverage expands
+
+## Observed issues
+
+### Issue 1: Parameter search doesn't filter by country (9 turns for personal allowance)
+
+**Problem**: When searching for "personal allowance", the agent gets US results (Illinois AABD) mixed with UK results. It took 9 turns to find the UK personal allowance.
+
+**Agent's search attempts** (only the last one succeeded):
+1. "personal allowance" → Illinois AABD (US)
+2. "income tax personal allowance" → empty
+3. "income_tax" → US CBO parameters
+4. "basic rate" → UK CGT (closer!)
+5. "allowance" → California SSI (US)
+6. "hmrc income_tax allowances personal" → empty
+7. "hmrc.income_tax.allowances" → found it!
+
+**Solution implemented**:
+- Added `tax_benefit_model_name` filter to `/parameters/` endpoint
+- Updated system prompt to instruct agent to use country filter
+
+**NOT acceptable solutions** (test hacking):
+- Adding specific parameter name examples to system prompt
+- Telling agent exactly what to search for
+
+### Issue 2: Duplicate parameters in database
+
+**Problem**: The same parameter name exists with multiple IDs. One entry has values and the other doesn't, and the agent picks the wrong one first.
+
+**Example**: `gov.hmrc.income_tax.allowances.personal_allowance.amount` has two entries with different IDs.
+
+**Solution implemented**: Deduplicate parameters by name in seed script (`seen_names` set).
+
+### Issue 6: Case-sensitive search
+
+**Problem**: Search for "personal allowance" didn't find "Personal allowance" (capital P).
+
+**Solution implemented**: Changed search to use `ILIKE` instead of `contains` for case-insensitive matching.
+
+### Issue 7: Model name mismatch
+
+**Problem**: System prompt said `policyengine_uk` but database has `policyengine-uk` (hyphen vs underscore).
+
+**Solution implemented**: Updated system prompt and API docstrings to use correct model names with hyphens.
+
+### Issue 3: Variables endpoint lacks search
+
+**Problem**: `/variables/` had no search or country filter, so the agent couldn't discover variable names.
+
+**Solution implemented**: Added `search` and `tax_benefit_model_name` filters to `/variables/`.
+
+### Issue 4: Datasets endpoint lacks country filter
+
+**Problem**: `/datasets/` returned all datasets, mixing UK and US.
+
+**Solution implemented**: Added `tax_benefit_model_name` filter to `/datasets/`.
+
+### Issue 5: Parameter values lack "current" filter
+
+**Problem**: The agent had to parse through all historical values to find the current one.
+
+**Solution implemented**: Added `current=true` filter to `/parameter-values/` endpoint.
+
+## API improvements summary
+
+| Endpoint | Improvement |
+|----------|-------------|
+| `/parameters/` | Added `tax_benefit_model_name` filter, case-insensitive search |
+| `/variables/` | Added `search` and `tax_benefit_model_name` filters, case-insensitive search |
+| `/datasets/` | Added `tax_benefit_model_name` filter |
+| `/parameter-values/` | Added `current` filter |
+| Seed script | Deduplicate parameters by name |
+| System prompt | Fixed model names (hyphen not underscore) |
+
+## Measurements
+
+| Question type | Baseline | After improvements | Target |
+|---------------|----------|-------------------|--------|
+| Parameter lookup (UK personal allowance) | 9-10 turns | **3 turns** | 3-4 |
+| Household calculation (UK £50k income) | 6 turns | - | 5-6 |
+
+## Progress log
+
+- 2024-12-30: Initial setup, created test framework and first batch of questions
+- 2024-12-30: Tested personal allowance lookup - 9-10 turns (target: 3-4). Root cause: no country filter on parameter search
+- 2024-12-30: Added `tax_benefit_model_name` filter to `/parameters/`, `/variables/`, `/datasets/`
+- 2024-12-30: Tested household calc - 6 turns (acceptable). Async polling is the overhead
+- 2024-12-30: Discovered duplicate parameters in DB causing extra turns
+- 2024-12-30: Fixed model name mismatch (policyengine-uk with hyphen, not underscore)
+- 2024-12-30: Added case-insensitive search using ILIKE
+- 2024-12-30: Tested personal allowance lookup - **3 turns** (target met!)
diff --git a/scripts/seed.py b/scripts/seed.py
index 6974ed2..b83c9db 100644
--- a/scripts/seed.py
+++ b/scripts/seed.py
@@ -1,5 +1,6 @@
 """Seed database with UK and US models, variables, parameters, datasets."""
 
+import argparse
 import json
 import logging
 import math
@@ -101,7 +102,7 @@ def bulk_insert(session, table: str, columns: list[str], rows: list[dict]):
     session.commit()
 
 
-def seed_model(model_version, session) -> TaxBenefitModelVersion:
+def seed_model(model_version, session, lite: bool = False) -> TaxBenefitModelVersion:
     """Seed a tax-benefit model with its variables and parameters."""
 
     with logfire.span(
@@ -205,12 +206,27 @@ def seed_model(model_version, session) -> TaxBenefitModelVersion:
                 f"  [green]✓[/green] Added {len(model_version.variables)} variables"
             )
 
-        # Add parameters (only user-facing ones: those with labels or gov.* params)
-        parameters_to_add = [p for p in model_version.parameters if p.label is not None]
-        console.print(
-            f"  Filtered to {len(parameters_to_add)} user-facing parameters "
-            f"(from {len(model_version.parameters)} total)"
-        )
+        # Add parameters (only user-facing ones: those with labels)
+        # Deduplicate by name - keep first occurrence
+        # In lite mode, exclude US state parameters (gov.states.*)
+        seen_names = set()
+        parameters_to_add = []
+        skipped_state_params = 0
+        for p in model_version.parameters:
+            if p.label is None or p.name in seen_names:
+                continue
+            # In lite mode, skip state-level parameters for faster seeding
+            if lite and p.name.startswith("gov.states."):
+                skipped_state_params += 1
+                continue
+            parameters_to_add.append(p)
+            seen_names.add(p.name)
+
+        filter_msg = f"  Filtered to {len(parameters_to_add)} user-facing parameters"
+        filter_msg += f" (from {len(model_version.parameters)} total, deduplicated by name)"
+        if lite and skipped_state_params > 0:
+            filter_msg += f", skipped {skipped_state_params} state params (lite mode)"
+        console.print(filter_msg)
 
         with logfire.span("add_parameters", count=len(parameters_to_add)):
             # Build list of parameter dicts for bulk insert
@@ -574,16 +590,25 @@ def seed_example_policies(session):
 
 def main():
     """Main seed function."""
+    parser = argparse.ArgumentParser(description="Seed PolicyEngine database")
+    parser.add_argument(
+        "--lite",
+        action="store_true",
+        help="Lite mode: skip US state parameters for faster local seeding",
+    )
+    args = parser.parse_args()
+
     with logfire.span("database_seeding"):
-        console.print("[bold green]PolicyEngine database seeding[/bold green]\n")
+        mode_str = " (lite mode)" if args.lite else ""
+        console.print(f"[bold green]PolicyEngine database seeding{mode_str}[/bold green]\n")
 
         with next(get_quiet_session()) as session:
             # Seed UK model
-            uk_version = seed_model(uk_latest, session)
+            uk_version = seed_model(uk_latest, session, lite=args.lite)
             console.print(f"[green]✓[/green] UK model seeded: {uk_version.id}\n")
 
             # Seed US model
-            us_version = seed_model(us_latest, session)
+            us_version = seed_model(us_latest, session, lite=args.lite)
             console.print(f"[green]✓[/green] US model seeded: {us_version.id}\n")
 
             # Seed datasets
diff --git a/src/policyengine_api/agent_sandbox.py b/src/policyengine_api/agent_sandbox.py
index 6408093..6488a32 100644
--- a/src/policyengine_api/agent_sandbox.py
+++ b/src/policyengine_api/agent_sandbox.py
@@ -21,7 +21,7 @@
 You have access to the full PolicyEngine API. Key workflows:
 
 1. **Household calculations**: POST to /household/calculate with people array, then poll GET /household/calculate/{job_id}
-2. **Parameter lookup**: GET /parameters/ with search query, then GET /parameter-values/ with parameter_id
+2. **Parameter lookup**: GET /parameters/ with search query and tax_benefit_model_name, then GET /parameter-values/ with parameter_id
 3. **Economic impact**:
    - GET /parameters/ to find parameter_id
    - POST /policies/ to create reform with parameter_values
@@ -29,6 +29,10 @@
    - POST /analysis/economic-impact with policy_id and dataset_id
    - Poll GET /analysis/economic-impact/{report_id} until completed
 
+When searching for parameters, use tax_benefit_model_name to filter by country:
+- "policyengine-uk" for UK parameters
+- "policyengine-us" for US parameters
+
 When answering questions:
 1. Use the API tools to get accurate, current data
 2. Show your calculations clearly
diff --git a/src/policyengine_api/api/datasets.py b/src/policyengine_api/api/datasets.py
index 12481f0..82540b7 100644
--- a/src/policyengine_api/api/datasets.py
+++ b/src/policyengine_api/api/datasets.py
@@ -11,24 +11,35 @@
 from fastapi import APIRouter, Depends, HTTPException
 from sqlmodel import Session, select
 
-from policyengine_api.models import Dataset, DatasetRead
+from policyengine_api.models import Dataset, DatasetRead, TaxBenefitModel
 from policyengine_api.services.database import get_session
 
 router = APIRouter(prefix="/datasets", tags=["datasets"])
 
 
 @router.get("/", response_model=List[DatasetRead])
-def list_datasets(session: Session = Depends(get_session)):
-    """List all available datasets.
+def list_datasets(
+    tax_benefit_model_name: str | None = None,
+    session: Session = Depends(get_session),
+):
+    """List available datasets.
 
     Returns datasets that can be used with the /analysis/economic-impact endpoint.
     Each dataset represents population microdata for a specific country and year.
 
-    USAGE: For UK analysis, look for datasets with names containing "uk" or "frs".
-    For US analysis, look for datasets with names containing "us" or "cps".
-    Use the dataset's id when calling /analysis/economic-impact.
+    Args:
+        tax_benefit_model_name: Filter by country model.
+            Use "policyengine-uk" for UK datasets.
+            Use "policyengine-us" for US datasets.
     """
-    datasets = session.exec(select(Dataset)).all()
+    query = select(Dataset)
+
+    if tax_benefit_model_name:
+        query = query.join(TaxBenefitModel).where(
+            TaxBenefitModel.name == tax_benefit_model_name
+        )
+
+    datasets = session.exec(query).all()
     return datasets
 
 
diff --git a/src/policyengine_api/api/parameter_values.py b/src/policyengine_api/api/parameter_values.py
index c16367f..4668ab8 100644
--- a/src/policyengine_api/api/parameter_values.py
+++ b/src/policyengine_api/api/parameter_values.py
@@ -5,12 +5,12 @@
 when a policy modifies a parameter.
 """
 
+from datetime import datetime, timezone
 from typing import List
 from uuid import UUID
 
 from fastapi import APIRouter, Depends, HTTPException
-from fastapi_cache.decorator import cache
-from sqlmodel import Session, select
+from sqlmodel import Session, or_, select
 
 from policyengine_api.models import ParameterValue, ParameterValueRead
 from policyengine_api.services.database import get_session
@@ -19,10 +19,10 @@
 
 
 @router.get("/", response_model=List[ParameterValueRead])
-@cache(expire=3600)  # Cache for 1 hour
 def list_parameter_values(
     parameter_id: UUID | None = None,
     policy_id: UUID | None = None,
+    current: bool = False,
     skip: int = 0,
     limit: int = 100,
     session: Session = Depends(get_session),
@@ -32,8 +32,11 @@ def list_parameter_values(
     Parameter values store the numeric/string values for policy parameters
     at specific time periods (start_date to end_date).
 
-    Use `parameter_id` to filter by a specific parameter.
-    Use `policy_id` to filter by a specific policy reform.
+    Args:
+        parameter_id: Filter by a specific parameter.
+        policy_id: Filter by a specific policy reform.
+        current: If true, only return values that are currently in effect
+            (start_date <= now and (end_date is null or end_date > now)).
     """
     query = select(ParameterValue)
 
@@ -43,8 +46,18 @@ def list_parameter_values(
     if policy_id:
         query = query.where(ParameterValue.policy_id == policy_id)
 
-    # Order by start_date ascending so historical/current values come first
-    query = query.order_by(ParameterValue.start_date.asc())
+    if current:
+        now = datetime.now(timezone.utc)
+        query = query.where(
+            ParameterValue.start_date <= now,
+            or_(
+                ParameterValue.end_date.is_(None),
+                ParameterValue.end_date > now,
+            ),
+        )
+
+    # Order by start_date descending so most recent values come first
+    query = query.order_by(ParameterValue.start_date.desc())
 
     parameter_values = session.exec(query.offset(skip).limit(limit)).all()
     return parameter_values
diff --git a/src/policyengine_api/api/parameters.py b/src/policyengine_api/api/parameters.py
index 3bb34bc..db029e5 100644
--- a/src/policyengine_api/api/parameters.py
+++ b/src/policyengine_api/api/parameters.py
@@ -11,7 +11,12 @@
 from fastapi import APIRouter, Depends, HTTPException
 from sqlmodel import Session, select
 
-from policyengine_api.models import Parameter, ParameterRead
+from policyengine_api.models import (
+    Parameter,
+    ParameterRead,
+    TaxBenefitModel,
+    TaxBenefitModelVersion,
+)
 from policyengine_api.services.database import get_session
 
 router = APIRouter(prefix="/parameters", tags=["parameters"])
@@ -22,6 +27,7 @@ def list_parameters(
     skip: int = 0,
     limit: int = 100,
     search: str | None = None,
+    tax_benefit_model_name: str | None = None,
     session: Session = Depends(get_session),
 ):
     """List available parameters with pagination and search.
@@ -29,16 +35,29 @@ def list_parameters(
     Parameters are policy levers (e.g. tax rates, thresholds, benefit amounts)
     that can be modified in reforms. Use parameter names when creating policies.
 
-    Use the `search` parameter to filter by parameter name, label, or description.
-    For example: search="basic_rate" or search="income tax"
+    Args:
+        search: Filter by parameter name, label, or description.
+        tax_benefit_model_name: Filter by country model.
+            Use "policyengine-uk" for UK parameters.
+            Use "policyengine-us" for US parameters.
     """
     query = select(Parameter)
 
+    # Filter by tax benefit model name (country)
+    if tax_benefit_model_name:
+        query = (
+            query.join(TaxBenefitModelVersion)
+            .join(TaxBenefitModel)
+            .where(TaxBenefitModel.name == tax_benefit_model_name)
+        )
+
     if search:
+        # Case-insensitive search using ILIKE
+        search_pattern = f"%{search}%"
         search_filter = (
-            Parameter.name.contains(search)
-            | Parameter.label.contains(search)
-            | Parameter.description.contains(search)
+            Parameter.name.ilike(search_pattern)
+            | Parameter.label.ilike(search_pattern)
+            | Parameter.description.ilike(search_pattern)
         )
         query = query.where(search_filter)
 
diff --git a/src/policyengine_api/api/variables.py b/src/policyengine_api/api/variables.py
index 8da5cf4..a24df44 100644
--- a/src/policyengine_api/api/variables.py
+++ b/src/policyengine_api/api/variables.py
@@ -9,28 +9,61 @@
 from uuid import UUID
 
 from fastapi import APIRouter, Depends, HTTPException
-from fastapi_cache.decorator import cache
 from sqlmodel import Session, select
 
-from policyengine_api.models import Variable, VariableRead
+from policyengine_api.models import (
+    TaxBenefitModel,
+    TaxBenefitModelVersion,
+    Variable,
+    VariableRead,
+)
 from policyengine_api.services.database import get_session
 
 router = APIRouter(prefix="/variables", tags=["variables"])
 
 
 @router.get("/", response_model=List[VariableRead])
-@cache(expire=3600)  # Cache for 1 hour
 def list_variables(
-    skip: int = 0, limit: int = 100, session: Session = Depends(get_session)
+    skip: int = 0,
+    limit: int = 100,
+    search: str | None = None,
+    tax_benefit_model_name: str | None = None,
+    session: Session = Depends(get_session),
 ):
-    """List available variables with pagination.
+    """List available variables with pagination and search.
 
     Variables are inputs (e.g. employment_income, age) and outputs (e.g. income_tax,
     household_net_income) of tax-benefit calculations. Use variable names in
     household calculation requests.
+
+    Args:
+        search: Filter by variable name, label, or description.
+        tax_benefit_model_name: Filter by country model.
+            Use "policyengine-uk" for UK variables.
+            Use "policyengine-us" for US variables.
     """
+    query = select(Variable)
+
+    # Filter by tax benefit model name (country)
+    if tax_benefit_model_name:
+        query = (
+            query.join(TaxBenefitModelVersion)
+            .join(TaxBenefitModel)
+            .where(TaxBenefitModel.name == tax_benefit_model_name)
+        )
+
+    if search:
+        # Case-insensitive search using ILIKE
+        search_pattern = f"%{search}%"
+        search_filter = (
+            Variable.name.ilike(search_pattern)
+            | Variable.label.ilike(search_pattern)
+            | Variable.description.ilike(search_pattern)
+        )
+        query = query.where(search_filter)
+
     variables = session.exec(
-        select(Variable).order_by(Variable.name).offset(skip).limit(limit)
+        query.order_by(Variable.name).offset(skip).limit(limit)
     ).all()
     return variables
 
diff --git a/supabase/.temp/cli-latest b/supabase/.temp/cli-latest
index 1b2faa2..8c68db7 100644
--- a/supabase/.temp/cli-latest
+++ b/supabase/.temp/cli-latest
@@ -1 +1 @@
-v2.65.5
\ No newline at end of file
+v2.67.1
\ No newline at end of file
diff --git a/tests/test_agent_policy_questions.py b/tests/test_agent_policy_questions.py
new file mode 100644
index 0000000..1550f89
--- /dev/null
+++ b/tests/test_agent_policy_questions.py
@@ -0,0 +1,221 @@
+"""Integration tests for agent policy questions.
+
+These tests run real agent queries and measure turn counts.
+Run with: pytest tests/test_agent_policy_questions.py -v -s
+
+The goal is to track agent performance and identify opportunities
+to improve API metadata/documentation to reduce turns needed.
+"""
+
+import os
+
+import pytest
+
+from policyengine_api.agent_sandbox import _run_agent_impl
+
+pytestmark = pytest.mark.integration
+
+# Use local API by default, override with POLICYENGINE_API_URL env var
+API_BASE = os.environ.get("POLICYENGINE_API_URL", "http://localhost:8000")
+
+
+class TestParameterLookup:
+    """Parameter lookup questions - should complete in 2-4 turns."""
+
+    def test_uk_personal_allowance(self):
+        """UK personal allowance lookup."""
+        result = _run_agent_impl(
+            "What is the current UK personal allowance?",
+            api_base_url=API_BASE,
+            max_turns=10,
+        )
+        assert result["status"] == "completed"
+        assert result["result"] is not None
+        print(f"\nTurns: {result['turns']}")
+        print(f"Result: {result['result'][:500]}")
+
+    def test_uk_higher_rate_threshold(self):
+        """UK higher rate threshold lookup."""
+        result = _run_agent_impl(
+            "At what income level does the UK higher rate (40%) tax band start?",
+            api_base_url=API_BASE,
+            max_turns=10,
+        )
+        assert result["status"] == "completed"
+        assert result["result"] is not None
+        print(f"\nTurns: {result['turns']}")
+        print(f"Result: {result['result'][:500]}")
+
+    def test_us_standard_deduction(self):
+        """US standard deduction lookup."""
+        result = _run_agent_impl(
+            "What is the US federal standard deduction for a single filer?",
+            api_base_url=API_BASE,
+            max_turns=10,
+        )
+        assert result["status"] == "completed"
+        assert result["result"] is not None
+        print(f"\nTurns: {result['turns']}")
+        print(f"Result: {result['result'][:500]}")
+
+
+class TestUKHouseholdSimple:
+    """Simple UK household questions - should complete in 3-5 turns."""
+
+    def test_income_tax_calculation(self):
+        """Basic income tax calculation."""
+        result = _run_agent_impl(
+            "What is my income tax if I earn £50,000 per year in the UK?",
+            api_base_url=API_BASE,
+            max_turns=10,
+        )
+        assert result["status"] == "completed"
+        assert result["result"] is not None
+        assert "£" in result["result"] or "GBP" in result["result"]
+        print(f"\nTurns: {result['turns']}")
+        print(f"Result: {result['result'][:500]}")
+
+    def test_child_benefit_lookup(self):
+        """Child benefit for a family."""
+        result = _run_agent_impl(
+            "How much child benefit would a UK family with 2 children receive per week?",
+            api_base_url=API_BASE,
+            max_turns=10,
+        )
+        assert result["status"] == "completed"
+        assert result["result"] is not None
+        print(f"\nTurns: {result['turns']}")
+        print(f"Result: {result['result'][:500]}")
+
+
+class TestUKHouseholdComplex:
+    """Complex UK household questions - may need 5-10 turns."""
+
+    def test_marginal_rate_at_100k(self):
+        """Marginal tax rate calculation at £100k (60% trap)."""
+        result = _run_agent_impl(
+            "What is the effective marginal tax rate for someone earning £100,000 in the UK? "
+            "Include the personal allowance taper.",
+            api_base_url=API_BASE,
+            max_turns=15,
+        )
+        assert result["status"] == "completed"
+        assert result["result"] is not None
+        print(f"\nTurns: {result['turns']}")
+        print(f"Result: {result['result'][:500]}")
+
+    def test_reform_comparison(self):
+        """Compare baseline vs reform for a household."""
+        result = _run_agent_impl(
+            "Compare the net income for someone earning £40,000 under current UK tax law "
+            "versus if the basic rate of income tax was 25% instead of 20%.",
+            api_base_url=API_BASE,
+            max_turns=15,
+        )
+        assert result["status"] == "completed"
+        assert result["result"] is not None
+        print(f"\nTurns: {result['turns']}")
+        print(f"Result: {result['result'][:500]}")
+
+
+class TestUSHouseholdSimple:
+    """Simple US household questions - should complete in 3-5 turns."""
+
+    def test_federal_income_tax(self):
+        """Basic federal income tax calculation."""
+        result = _run_agent_impl(
+            "What is my federal income tax if I earn $75,000 per year in the US?",
+            api_base_url=API_BASE,
+            max_turns=10,
+        )
+        assert result["status"] == "completed"
+        assert result["result"] is not None
+        assert "$" in result["result"] or "USD" in result["result"]
+        print(f"\nTurns: {result['turns']}")
+        print(f"Result: {result['result'][:500]}")
+
+    def test_snap_eligibility(self):
+        """SNAP benefit calculation."""
+        result = _run_agent_impl(
+            "How much SNAP (food stamps) would a family of 4 with $30,000 annual income "
+            "receive in the US?",
+            api_base_url=API_BASE,
+            max_turns=10,
+        )
+        assert result["status"] == "completed"
+        assert result["result"] is not None
+        print(f"\nTurns: {result['turns']}")
+        print(f"Result: {result['result'][:500]}")
+
+
+class TestUSHouseholdComplex:
+    """Complex US household questions - may need 5-10 turns."""
+
+    def test_eitc_calculation(self):
+        """EITC with children calculation."""
+        result = _run_agent_impl(
+            "Calculate the Earned Income Tax Credit for a single parent with 2 children "
+            "earning $25,000 per year in the US.",
+            api_base_url=API_BASE,
+            max_turns=15,
+        )
+        assert result["status"] == "completed"
+        assert result["result"] is not None
+        print(f"\nTurns: {result['turns']}")
+        print(f"Result: {result['result'][:500]}")
+
+
+class TestEconomyWide:
+    """Economy-wide analysis questions - budgetary impacts, distributional analysis."""
+
+    def test_uk_policy_budgetary_impact(self):
+        """UK policy reform budgetary impact."""
+        result = _run_agent_impl(
+            "What would be the budgetary impact of raising the UK personal allowance to £15,000?",
+            api_base_url=API_BASE,
+            max_turns=20,
+        )
+        assert result["status"] == "completed"
+        assert result["result"] is not None
+        print(f"\nTurns: {result['turns']}")
+        print(f"Result: {result['result'][:500]}")
+
+    def test_us_policy_winners_losers(self):
+        """US policy reform winners and losers."""
+        result = _run_agent_impl(
+            "If the US doubled the Child Tax Credit, which income deciles would benefit most?",
+            api_base_url=API_BASE,
+            max_turns=20,
+        )
+        assert result["status"] == "completed"
+        assert result["result"] is not None
+        print(f"\nTurns: {result['turns']}")
+        print(f"Result: {result['result'][:500]}")
+
+
+class TestTurnCounting:
+    """Measure turn efficiency per question; the turn budget is reported, not enforced."""
+
+    @pytest.mark.parametrize(
+        "question,max_expected_turns",
+        [
+            ("What is the UK personal allowance?", 5),
+            ("What is the US standard deduction?", 5),
+            ("Calculate income tax for £30,000 UK salary", 6),
+            ("Calculate federal income tax for $50,000 US salary", 6),
+        ],
+    )
+    def test_turn_efficiency(self, question, max_expected_turns):
+        """Run one question with a hard cap of max_expected_turns + 5; fails only if status is not "completed", and merely prints a WARNING when turns exceed max_expected_turns."""
+        result = _run_agent_impl(
+            question,
+            api_base_url=API_BASE,
+            max_turns=max_expected_turns + 5,
+        )
+        assert result["status"] == "completed"
+        print(f"\nQuestion: {question}")
+        print(f"Turns: {result['turns']} (max expected: {max_expected_turns})")
+        print(f"Result: {result['result'][:300]}")
+
+        if result["turns"] > max_expected_turns:
+            print(f"WARNING: Took {result['turns']} turns, expected <= {max_expected_turns}")