From 6b0589f7a1fbb1e191162d24e41a19907d46ac7b Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Sun, 30 Nov 2025 21:21:41 +0200 Subject: [PATCH 01/21] feat: Create data structure to represent Congressional district metadata --- .../data/congressional_districts.py | 579 ++++++++++++++++++ 1 file changed, 579 insertions(+) create mode 100644 policyengine_api/data/congressional_districts.py diff --git a/policyengine_api/data/congressional_districts.py b/policyengine_api/data/congressional_districts.py new file mode 100644 index 00000000..af8070dc --- /dev/null +++ b/policyengine_api/data/congressional_districts.py @@ -0,0 +1,579 @@ +""" +US Congressional District Metadata + +This module defines the metadata for all 435 US Congressional districts +based on the 2020 Census apportionment, effective for the 118th Congress +(2023-2025) and continuing through the 119th Congress (2025-2027). + +Source: https://ballotpedia.org/Congressional_apportionment_after_the_2020_census +Census Data: https://www.census.gov/data/tables/2020/dec/2020-apportionment-data.html +""" + +from pydantic import BaseModel, Field + + +class CongressionalDistrictMetadataItem(BaseModel): + """ + Metadata for a single US Congressional district. 
+ + Uses Pydantic BaseModel for: + - Runtime validation of data integrity + - Automatic serialization/deserialization + - Consistency with existing codebase patterns (see policyengine_api/endpoints/economy/compare.py) + - Self-documenting schema with type hints + """ + + state_code: str = Field( + ..., + description="Two-letter US state code (uppercase)", + min_length=2, + max_length=2, + pattern="^[A-Z]{2}$" + ) + number: int = Field( + ..., + description="Congressional district number (1 for at-large districts)", + ge=1 + ) + + +# All 435 US Congressional districts based on 2020 Census apportionment +CONGRESSIONAL_DISTRICTS: list[CongressionalDistrictMetadataItem] = [ + # Alabama - 7 districts + CongressionalDistrictMetadataItem(state_code="AL", number=1), + CongressionalDistrictMetadataItem(state_code="AL", number=2), + CongressionalDistrictMetadataItem(state_code="AL", number=3), + CongressionalDistrictMetadataItem(state_code="AL", number=4), + CongressionalDistrictMetadataItem(state_code="AL", number=5), + CongressionalDistrictMetadataItem(state_code="AL", number=6), + CongressionalDistrictMetadataItem(state_code="AL", number=7), + + # Alaska - 1 at-large district + CongressionalDistrictMetadataItem(state_code="AK", number=1), + + # Arizona - 9 districts + CongressionalDistrictMetadataItem(state_code="AZ", number=1), + CongressionalDistrictMetadataItem(state_code="AZ", number=2), + CongressionalDistrictMetadataItem(state_code="AZ", number=3), + CongressionalDistrictMetadataItem(state_code="AZ", number=4), + CongressionalDistrictMetadataItem(state_code="AZ", number=5), + CongressionalDistrictMetadataItem(state_code="AZ", number=6), + CongressionalDistrictMetadataItem(state_code="AZ", number=7), + CongressionalDistrictMetadataItem(state_code="AZ", number=8), + CongressionalDistrictMetadataItem(state_code="AZ", number=9), + + # Arkansas - 4 districts + CongressionalDistrictMetadataItem(state_code="AR", number=1), + 
CongressionalDistrictMetadataItem(state_code="AR", number=2), + CongressionalDistrictMetadataItem(state_code="AR", number=3), + CongressionalDistrictMetadataItem(state_code="AR", number=4), + + # California - 52 districts + CongressionalDistrictMetadataItem(state_code="CA", number=1), + CongressionalDistrictMetadataItem(state_code="CA", number=2), + CongressionalDistrictMetadataItem(state_code="CA", number=3), + CongressionalDistrictMetadataItem(state_code="CA", number=4), + CongressionalDistrictMetadataItem(state_code="CA", number=5), + CongressionalDistrictMetadataItem(state_code="CA", number=6), + CongressionalDistrictMetadataItem(state_code="CA", number=7), + CongressionalDistrictMetadataItem(state_code="CA", number=8), + CongressionalDistrictMetadataItem(state_code="CA", number=9), + CongressionalDistrictMetadataItem(state_code="CA", number=10), + CongressionalDistrictMetadataItem(state_code="CA", number=11), + CongressionalDistrictMetadataItem(state_code="CA", number=12), + CongressionalDistrictMetadataItem(state_code="CA", number=13), + CongressionalDistrictMetadataItem(state_code="CA", number=14), + CongressionalDistrictMetadataItem(state_code="CA", number=15), + CongressionalDistrictMetadataItem(state_code="CA", number=16), + CongressionalDistrictMetadataItem(state_code="CA", number=17), + CongressionalDistrictMetadataItem(state_code="CA", number=18), + CongressionalDistrictMetadataItem(state_code="CA", number=19), + CongressionalDistrictMetadataItem(state_code="CA", number=20), + CongressionalDistrictMetadataItem(state_code="CA", number=21), + CongressionalDistrictMetadataItem(state_code="CA", number=22), + CongressionalDistrictMetadataItem(state_code="CA", number=23), + CongressionalDistrictMetadataItem(state_code="CA", number=24), + CongressionalDistrictMetadataItem(state_code="CA", number=25), + CongressionalDistrictMetadataItem(state_code="CA", number=26), + CongressionalDistrictMetadataItem(state_code="CA", number=27), + 
CongressionalDistrictMetadataItem(state_code="CA", number=28), + CongressionalDistrictMetadataItem(state_code="CA", number=29), + CongressionalDistrictMetadataItem(state_code="CA", number=30), + CongressionalDistrictMetadataItem(state_code="CA", number=31), + CongressionalDistrictMetadataItem(state_code="CA", number=32), + CongressionalDistrictMetadataItem(state_code="CA", number=33), + CongressionalDistrictMetadataItem(state_code="CA", number=34), + CongressionalDistrictMetadataItem(state_code="CA", number=35), + CongressionalDistrictMetadataItem(state_code="CA", number=36), + CongressionalDistrictMetadataItem(state_code="CA", number=37), + CongressionalDistrictMetadataItem(state_code="CA", number=38), + CongressionalDistrictMetadataItem(state_code="CA", number=39), + CongressionalDistrictMetadataItem(state_code="CA", number=40), + CongressionalDistrictMetadataItem(state_code="CA", number=41), + CongressionalDistrictMetadataItem(state_code="CA", number=42), + CongressionalDistrictMetadataItem(state_code="CA", number=43), + CongressionalDistrictMetadataItem(state_code="CA", number=44), + CongressionalDistrictMetadataItem(state_code="CA", number=45), + CongressionalDistrictMetadataItem(state_code="CA", number=46), + CongressionalDistrictMetadataItem(state_code="CA", number=47), + CongressionalDistrictMetadataItem(state_code="CA", number=48), + CongressionalDistrictMetadataItem(state_code="CA", number=49), + CongressionalDistrictMetadataItem(state_code="CA", number=50), + CongressionalDistrictMetadataItem(state_code="CA", number=51), + CongressionalDistrictMetadataItem(state_code="CA", number=52), + + # Colorado - 8 districts + CongressionalDistrictMetadataItem(state_code="CO", number=1), + CongressionalDistrictMetadataItem(state_code="CO", number=2), + CongressionalDistrictMetadataItem(state_code="CO", number=3), + CongressionalDistrictMetadataItem(state_code="CO", number=4), + CongressionalDistrictMetadataItem(state_code="CO", number=5), + 
CongressionalDistrictMetadataItem(state_code="CO", number=6), + CongressionalDistrictMetadataItem(state_code="CO", number=7), + CongressionalDistrictMetadataItem(state_code="CO", number=8), + + # Connecticut - 5 districts + CongressionalDistrictMetadataItem(state_code="CT", number=1), + CongressionalDistrictMetadataItem(state_code="CT", number=2), + CongressionalDistrictMetadataItem(state_code="CT", number=3), + CongressionalDistrictMetadataItem(state_code="CT", number=4), + CongressionalDistrictMetadataItem(state_code="CT", number=5), + + # Delaware - 1 at-large district + CongressionalDistrictMetadataItem(state_code="DE", number=1), + + # Florida - 28 districts + CongressionalDistrictMetadataItem(state_code="FL", number=1), + CongressionalDistrictMetadataItem(state_code="FL", number=2), + CongressionalDistrictMetadataItem(state_code="FL", number=3), + CongressionalDistrictMetadataItem(state_code="FL", number=4), + CongressionalDistrictMetadataItem(state_code="FL", number=5), + CongressionalDistrictMetadataItem(state_code="FL", number=6), + CongressionalDistrictMetadataItem(state_code="FL", number=7), + CongressionalDistrictMetadataItem(state_code="FL", number=8), + CongressionalDistrictMetadataItem(state_code="FL", number=9), + CongressionalDistrictMetadataItem(state_code="FL", number=10), + CongressionalDistrictMetadataItem(state_code="FL", number=11), + CongressionalDistrictMetadataItem(state_code="FL", number=12), + CongressionalDistrictMetadataItem(state_code="FL", number=13), + CongressionalDistrictMetadataItem(state_code="FL", number=14), + CongressionalDistrictMetadataItem(state_code="FL", number=15), + CongressionalDistrictMetadataItem(state_code="FL", number=16), + CongressionalDistrictMetadataItem(state_code="FL", number=17), + CongressionalDistrictMetadataItem(state_code="FL", number=18), + CongressionalDistrictMetadataItem(state_code="FL", number=19), + CongressionalDistrictMetadataItem(state_code="FL", number=20), + 
CongressionalDistrictMetadataItem(state_code="FL", number=21), + CongressionalDistrictMetadataItem(state_code="FL", number=22), + CongressionalDistrictMetadataItem(state_code="FL", number=23), + CongressionalDistrictMetadataItem(state_code="FL", number=24), + CongressionalDistrictMetadataItem(state_code="FL", number=25), + CongressionalDistrictMetadataItem(state_code="FL", number=26), + CongressionalDistrictMetadataItem(state_code="FL", number=27), + CongressionalDistrictMetadataItem(state_code="FL", number=28), + + # Georgia - 14 districts + CongressionalDistrictMetadataItem(state_code="GA", number=1), + CongressionalDistrictMetadataItem(state_code="GA", number=2), + CongressionalDistrictMetadataItem(state_code="GA", number=3), + CongressionalDistrictMetadataItem(state_code="GA", number=4), + CongressionalDistrictMetadataItem(state_code="GA", number=5), + CongressionalDistrictMetadataItem(state_code="GA", number=6), + CongressionalDistrictMetadataItem(state_code="GA", number=7), + CongressionalDistrictMetadataItem(state_code="GA", number=8), + CongressionalDistrictMetadataItem(state_code="GA", number=9), + CongressionalDistrictMetadataItem(state_code="GA", number=10), + CongressionalDistrictMetadataItem(state_code="GA", number=11), + CongressionalDistrictMetadataItem(state_code="GA", number=12), + CongressionalDistrictMetadataItem(state_code="GA", number=13), + CongressionalDistrictMetadataItem(state_code="GA", number=14), + + # Hawaii - 2 districts + CongressionalDistrictMetadataItem(state_code="HI", number=1), + CongressionalDistrictMetadataItem(state_code="HI", number=2), + + # Idaho - 2 districts + CongressionalDistrictMetadataItem(state_code="ID", number=1), + CongressionalDistrictMetadataItem(state_code="ID", number=2), + + # Illinois - 17 districts + CongressionalDistrictMetadataItem(state_code="IL", number=1), + CongressionalDistrictMetadataItem(state_code="IL", number=2), + CongressionalDistrictMetadataItem(state_code="IL", number=3), + 
CongressionalDistrictMetadataItem(state_code="IL", number=4), + CongressionalDistrictMetadataItem(state_code="IL", number=5), + CongressionalDistrictMetadataItem(state_code="IL", number=6), + CongressionalDistrictMetadataItem(state_code="IL", number=7), + CongressionalDistrictMetadataItem(state_code="IL", number=8), + CongressionalDistrictMetadataItem(state_code="IL", number=9), + CongressionalDistrictMetadataItem(state_code="IL", number=10), + CongressionalDistrictMetadataItem(state_code="IL", number=11), + CongressionalDistrictMetadataItem(state_code="IL", number=12), + CongressionalDistrictMetadataItem(state_code="IL", number=13), + CongressionalDistrictMetadataItem(state_code="IL", number=14), + CongressionalDistrictMetadataItem(state_code="IL", number=15), + CongressionalDistrictMetadataItem(state_code="IL", number=16), + CongressionalDistrictMetadataItem(state_code="IL", number=17), + + # Indiana - 9 districts + CongressionalDistrictMetadataItem(state_code="IN", number=1), + CongressionalDistrictMetadataItem(state_code="IN", number=2), + CongressionalDistrictMetadataItem(state_code="IN", number=3), + CongressionalDistrictMetadataItem(state_code="IN", number=4), + CongressionalDistrictMetadataItem(state_code="IN", number=5), + CongressionalDistrictMetadataItem(state_code="IN", number=6), + CongressionalDistrictMetadataItem(state_code="IN", number=7), + CongressionalDistrictMetadataItem(state_code="IN", number=8), + CongressionalDistrictMetadataItem(state_code="IN", number=9), + + # Iowa - 4 districts + CongressionalDistrictMetadataItem(state_code="IA", number=1), + CongressionalDistrictMetadataItem(state_code="IA", number=2), + CongressionalDistrictMetadataItem(state_code="IA", number=3), + CongressionalDistrictMetadataItem(state_code="IA", number=4), + + # Kansas - 4 districts + CongressionalDistrictMetadataItem(state_code="KS", number=1), + CongressionalDistrictMetadataItem(state_code="KS", number=2), + CongressionalDistrictMetadataItem(state_code="KS", 
number=3), + CongressionalDistrictMetadataItem(state_code="KS", number=4), + + # Kentucky - 6 districts + CongressionalDistrictMetadataItem(state_code="KY", number=1), + CongressionalDistrictMetadataItem(state_code="KY", number=2), + CongressionalDistrictMetadataItem(state_code="KY", number=3), + CongressionalDistrictMetadataItem(state_code="KY", number=4), + CongressionalDistrictMetadataItem(state_code="KY", number=5), + CongressionalDistrictMetadataItem(state_code="KY", number=6), + + # Louisiana - 6 districts + CongressionalDistrictMetadataItem(state_code="LA", number=1), + CongressionalDistrictMetadataItem(state_code="LA", number=2), + CongressionalDistrictMetadataItem(state_code="LA", number=3), + CongressionalDistrictMetadataItem(state_code="LA", number=4), + CongressionalDistrictMetadataItem(state_code="LA", number=5), + CongressionalDistrictMetadataItem(state_code="LA", number=6), + + # Maine - 2 districts + CongressionalDistrictMetadataItem(state_code="ME", number=1), + CongressionalDistrictMetadataItem(state_code="ME", number=2), + + # Maryland - 8 districts + CongressionalDistrictMetadataItem(state_code="MD", number=1), + CongressionalDistrictMetadataItem(state_code="MD", number=2), + CongressionalDistrictMetadataItem(state_code="MD", number=3), + CongressionalDistrictMetadataItem(state_code="MD", number=4), + CongressionalDistrictMetadataItem(state_code="MD", number=5), + CongressionalDistrictMetadataItem(state_code="MD", number=6), + CongressionalDistrictMetadataItem(state_code="MD", number=7), + CongressionalDistrictMetadataItem(state_code="MD", number=8), + + # Massachusetts - 9 districts + CongressionalDistrictMetadataItem(state_code="MA", number=1), + CongressionalDistrictMetadataItem(state_code="MA", number=2), + CongressionalDistrictMetadataItem(state_code="MA", number=3), + CongressionalDistrictMetadataItem(state_code="MA", number=4), + CongressionalDistrictMetadataItem(state_code="MA", number=5), + 
CongressionalDistrictMetadataItem(state_code="MA", number=6), + CongressionalDistrictMetadataItem(state_code="MA", number=7), + CongressionalDistrictMetadataItem(state_code="MA", number=8), + CongressionalDistrictMetadataItem(state_code="MA", number=9), + + # Michigan - 13 districts + CongressionalDistrictMetadataItem(state_code="MI", number=1), + CongressionalDistrictMetadataItem(state_code="MI", number=2), + CongressionalDistrictMetadataItem(state_code="MI", number=3), + CongressionalDistrictMetadataItem(state_code="MI", number=4), + CongressionalDistrictMetadataItem(state_code="MI", number=5), + CongressionalDistrictMetadataItem(state_code="MI", number=6), + CongressionalDistrictMetadataItem(state_code="MI", number=7), + CongressionalDistrictMetadataItem(state_code="MI", number=8), + CongressionalDistrictMetadataItem(state_code="MI", number=9), + CongressionalDistrictMetadataItem(state_code="MI", number=10), + CongressionalDistrictMetadataItem(state_code="MI", number=11), + CongressionalDistrictMetadataItem(state_code="MI", number=12), + CongressionalDistrictMetadataItem(state_code="MI", number=13), + + # Minnesota - 8 districts + CongressionalDistrictMetadataItem(state_code="MN", number=1), + CongressionalDistrictMetadataItem(state_code="MN", number=2), + CongressionalDistrictMetadataItem(state_code="MN", number=3), + CongressionalDistrictMetadataItem(state_code="MN", number=4), + CongressionalDistrictMetadataItem(state_code="MN", number=5), + CongressionalDistrictMetadataItem(state_code="MN", number=6), + CongressionalDistrictMetadataItem(state_code="MN", number=7), + CongressionalDistrictMetadataItem(state_code="MN", number=8), + + # Mississippi - 4 districts + CongressionalDistrictMetadataItem(state_code="MS", number=1), + CongressionalDistrictMetadataItem(state_code="MS", number=2), + CongressionalDistrictMetadataItem(state_code="MS", number=3), + CongressionalDistrictMetadataItem(state_code="MS", number=4), + + # Missouri - 8 districts + 
CongressionalDistrictMetadataItem(state_code="MO", number=1), + CongressionalDistrictMetadataItem(state_code="MO", number=2), + CongressionalDistrictMetadataItem(state_code="MO", number=3), + CongressionalDistrictMetadataItem(state_code="MO", number=4), + CongressionalDistrictMetadataItem(state_code="MO", number=5), + CongressionalDistrictMetadataItem(state_code="MO", number=6), + CongressionalDistrictMetadataItem(state_code="MO", number=7), + CongressionalDistrictMetadataItem(state_code="MO", number=8), + + # Montana - 2 districts + CongressionalDistrictMetadataItem(state_code="MT", number=1), + CongressionalDistrictMetadataItem(state_code="MT", number=2), + + # Nebraska - 3 districts + CongressionalDistrictMetadataItem(state_code="NE", number=1), + CongressionalDistrictMetadataItem(state_code="NE", number=2), + CongressionalDistrictMetadataItem(state_code="NE", number=3), + + # Nevada - 4 districts + CongressionalDistrictMetadataItem(state_code="NV", number=1), + CongressionalDistrictMetadataItem(state_code="NV", number=2), + CongressionalDistrictMetadataItem(state_code="NV", number=3), + CongressionalDistrictMetadataItem(state_code="NV", number=4), + + # New Hampshire - 2 districts + CongressionalDistrictMetadataItem(state_code="NH", number=1), + CongressionalDistrictMetadataItem(state_code="NH", number=2), + + # New Jersey - 12 districts + CongressionalDistrictMetadataItem(state_code="NJ", number=1), + CongressionalDistrictMetadataItem(state_code="NJ", number=2), + CongressionalDistrictMetadataItem(state_code="NJ", number=3), + CongressionalDistrictMetadataItem(state_code="NJ", number=4), + CongressionalDistrictMetadataItem(state_code="NJ", number=5), + CongressionalDistrictMetadataItem(state_code="NJ", number=6), + CongressionalDistrictMetadataItem(state_code="NJ", number=7), + CongressionalDistrictMetadataItem(state_code="NJ", number=8), + CongressionalDistrictMetadataItem(state_code="NJ", number=9), + CongressionalDistrictMetadataItem(state_code="NJ", 
number=10), + CongressionalDistrictMetadataItem(state_code="NJ", number=11), + CongressionalDistrictMetadataItem(state_code="NJ", number=12), + + # New Mexico - 3 districts + CongressionalDistrictMetadataItem(state_code="NM", number=1), + CongressionalDistrictMetadataItem(state_code="NM", number=2), + CongressionalDistrictMetadataItem(state_code="NM", number=3), + + # New York - 26 districts + CongressionalDistrictMetadataItem(state_code="NY", number=1), + CongressionalDistrictMetadataItem(state_code="NY", number=2), + CongressionalDistrictMetadataItem(state_code="NY", number=3), + CongressionalDistrictMetadataItem(state_code="NY", number=4), + CongressionalDistrictMetadataItem(state_code="NY", number=5), + CongressionalDistrictMetadataItem(state_code="NY", number=6), + CongressionalDistrictMetadataItem(state_code="NY", number=7), + CongressionalDistrictMetadataItem(state_code="NY", number=8), + CongressionalDistrictMetadataItem(state_code="NY", number=9), + CongressionalDistrictMetadataItem(state_code="NY", number=10), + CongressionalDistrictMetadataItem(state_code="NY", number=11), + CongressionalDistrictMetadataItem(state_code="NY", number=12), + CongressionalDistrictMetadataItem(state_code="NY", number=13), + CongressionalDistrictMetadataItem(state_code="NY", number=14), + CongressionalDistrictMetadataItem(state_code="NY", number=15), + CongressionalDistrictMetadataItem(state_code="NY", number=16), + CongressionalDistrictMetadataItem(state_code="NY", number=17), + CongressionalDistrictMetadataItem(state_code="NY", number=18), + CongressionalDistrictMetadataItem(state_code="NY", number=19), + CongressionalDistrictMetadataItem(state_code="NY", number=20), + CongressionalDistrictMetadataItem(state_code="NY", number=21), + CongressionalDistrictMetadataItem(state_code="NY", number=22), + CongressionalDistrictMetadataItem(state_code="NY", number=23), + CongressionalDistrictMetadataItem(state_code="NY", number=24), + CongressionalDistrictMetadataItem(state_code="NY", 
number=25), + CongressionalDistrictMetadataItem(state_code="NY", number=26), + + # North Carolina - 14 districts + CongressionalDistrictMetadataItem(state_code="NC", number=1), + CongressionalDistrictMetadataItem(state_code="NC", number=2), + CongressionalDistrictMetadataItem(state_code="NC", number=3), + CongressionalDistrictMetadataItem(state_code="NC", number=4), + CongressionalDistrictMetadataItem(state_code="NC", number=5), + CongressionalDistrictMetadataItem(state_code="NC", number=6), + CongressionalDistrictMetadataItem(state_code="NC", number=7), + CongressionalDistrictMetadataItem(state_code="NC", number=8), + CongressionalDistrictMetadataItem(state_code="NC", number=9), + CongressionalDistrictMetadataItem(state_code="NC", number=10), + CongressionalDistrictMetadataItem(state_code="NC", number=11), + CongressionalDistrictMetadataItem(state_code="NC", number=12), + CongressionalDistrictMetadataItem(state_code="NC", number=13), + CongressionalDistrictMetadataItem(state_code="NC", number=14), + + # North Dakota - 1 at-large district + CongressionalDistrictMetadataItem(state_code="ND", number=1), + + # Ohio - 15 districts + CongressionalDistrictMetadataItem(state_code="OH", number=1), + CongressionalDistrictMetadataItem(state_code="OH", number=2), + CongressionalDistrictMetadataItem(state_code="OH", number=3), + CongressionalDistrictMetadataItem(state_code="OH", number=4), + CongressionalDistrictMetadataItem(state_code="OH", number=5), + CongressionalDistrictMetadataItem(state_code="OH", number=6), + CongressionalDistrictMetadataItem(state_code="OH", number=7), + CongressionalDistrictMetadataItem(state_code="OH", number=8), + CongressionalDistrictMetadataItem(state_code="OH", number=9), + CongressionalDistrictMetadataItem(state_code="OH", number=10), + CongressionalDistrictMetadataItem(state_code="OH", number=11), + CongressionalDistrictMetadataItem(state_code="OH", number=12), + CongressionalDistrictMetadataItem(state_code="OH", number=13), + 
CongressionalDistrictMetadataItem(state_code="OH", number=14), + CongressionalDistrictMetadataItem(state_code="OH", number=15), + + # Oklahoma - 5 districts + CongressionalDistrictMetadataItem(state_code="OK", number=1), + CongressionalDistrictMetadataItem(state_code="OK", number=2), + CongressionalDistrictMetadataItem(state_code="OK", number=3), + CongressionalDistrictMetadataItem(state_code="OK", number=4), + CongressionalDistrictMetadataItem(state_code="OK", number=5), + + # Oregon - 6 districts + CongressionalDistrictMetadataItem(state_code="OR", number=1), + CongressionalDistrictMetadataItem(state_code="OR", number=2), + CongressionalDistrictMetadataItem(state_code="OR", number=3), + CongressionalDistrictMetadataItem(state_code="OR", number=4), + CongressionalDistrictMetadataItem(state_code="OR", number=5), + CongressionalDistrictMetadataItem(state_code="OR", number=6), + + # Pennsylvania - 17 districts + CongressionalDistrictMetadataItem(state_code="PA", number=1), + CongressionalDistrictMetadataItem(state_code="PA", number=2), + CongressionalDistrictMetadataItem(state_code="PA", number=3), + CongressionalDistrictMetadataItem(state_code="PA", number=4), + CongressionalDistrictMetadataItem(state_code="PA", number=5), + CongressionalDistrictMetadataItem(state_code="PA", number=6), + CongressionalDistrictMetadataItem(state_code="PA", number=7), + CongressionalDistrictMetadataItem(state_code="PA", number=8), + CongressionalDistrictMetadataItem(state_code="PA", number=9), + CongressionalDistrictMetadataItem(state_code="PA", number=10), + CongressionalDistrictMetadataItem(state_code="PA", number=11), + CongressionalDistrictMetadataItem(state_code="PA", number=12), + CongressionalDistrictMetadataItem(state_code="PA", number=13), + CongressionalDistrictMetadataItem(state_code="PA", number=14), + CongressionalDistrictMetadataItem(state_code="PA", number=15), + CongressionalDistrictMetadataItem(state_code="PA", number=16), + 
CongressionalDistrictMetadataItem(state_code="PA", number=17), + + # Rhode Island - 2 districts + CongressionalDistrictMetadataItem(state_code="RI", number=1), + CongressionalDistrictMetadataItem(state_code="RI", number=2), + + # South Carolina - 7 districts + CongressionalDistrictMetadataItem(state_code="SC", number=1), + CongressionalDistrictMetadataItem(state_code="SC", number=2), + CongressionalDistrictMetadataItem(state_code="SC", number=3), + CongressionalDistrictMetadataItem(state_code="SC", number=4), + CongressionalDistrictMetadataItem(state_code="SC", number=5), + CongressionalDistrictMetadataItem(state_code="SC", number=6), + CongressionalDistrictMetadataItem(state_code="SC", number=7), + + # South Dakota - 1 at-large district + CongressionalDistrictMetadataItem(state_code="SD", number=1), + + # Tennessee - 9 districts + CongressionalDistrictMetadataItem(state_code="TN", number=1), + CongressionalDistrictMetadataItem(state_code="TN", number=2), + CongressionalDistrictMetadataItem(state_code="TN", number=3), + CongressionalDistrictMetadataItem(state_code="TN", number=4), + CongressionalDistrictMetadataItem(state_code="TN", number=5), + CongressionalDistrictMetadataItem(state_code="TN", number=6), + CongressionalDistrictMetadataItem(state_code="TN", number=7), + CongressionalDistrictMetadataItem(state_code="TN", number=8), + CongressionalDistrictMetadataItem(state_code="TN", number=9), + + # Texas - 38 districts + CongressionalDistrictMetadataItem(state_code="TX", number=1), + CongressionalDistrictMetadataItem(state_code="TX", number=2), + CongressionalDistrictMetadataItem(state_code="TX", number=3), + CongressionalDistrictMetadataItem(state_code="TX", number=4), + CongressionalDistrictMetadataItem(state_code="TX", number=5), + CongressionalDistrictMetadataItem(state_code="TX", number=6), + CongressionalDistrictMetadataItem(state_code="TX", number=7), + CongressionalDistrictMetadataItem(state_code="TX", number=8), + 
CongressionalDistrictMetadataItem(state_code="TX", number=9), + CongressionalDistrictMetadataItem(state_code="TX", number=10), + CongressionalDistrictMetadataItem(state_code="TX", number=11), + CongressionalDistrictMetadataItem(state_code="TX", number=12), + CongressionalDistrictMetadataItem(state_code="TX", number=13), + CongressionalDistrictMetadataItem(state_code="TX", number=14), + CongressionalDistrictMetadataItem(state_code="TX", number=15), + CongressionalDistrictMetadataItem(state_code="TX", number=16), + CongressionalDistrictMetadataItem(state_code="TX", number=17), + CongressionalDistrictMetadataItem(state_code="TX", number=18), + CongressionalDistrictMetadataItem(state_code="TX", number=19), + CongressionalDistrictMetadataItem(state_code="TX", number=20), + CongressionalDistrictMetadataItem(state_code="TX", number=21), + CongressionalDistrictMetadataItem(state_code="TX", number=22), + CongressionalDistrictMetadataItem(state_code="TX", number=23), + CongressionalDistrictMetadataItem(state_code="TX", number=24), + CongressionalDistrictMetadataItem(state_code="TX", number=25), + CongressionalDistrictMetadataItem(state_code="TX", number=26), + CongressionalDistrictMetadataItem(state_code="TX", number=27), + CongressionalDistrictMetadataItem(state_code="TX", number=28), + CongressionalDistrictMetadataItem(state_code="TX", number=29), + CongressionalDistrictMetadataItem(state_code="TX", number=30), + CongressionalDistrictMetadataItem(state_code="TX", number=31), + CongressionalDistrictMetadataItem(state_code="TX", number=32), + CongressionalDistrictMetadataItem(state_code="TX", number=33), + CongressionalDistrictMetadataItem(state_code="TX", number=34), + CongressionalDistrictMetadataItem(state_code="TX", number=35), + CongressionalDistrictMetadataItem(state_code="TX", number=36), + CongressionalDistrictMetadataItem(state_code="TX", number=37), + CongressionalDistrictMetadataItem(state_code="TX", number=38), + + # Utah - 4 districts + 
CongressionalDistrictMetadataItem(state_code="UT", number=1), + CongressionalDistrictMetadataItem(state_code="UT", number=2), + CongressionalDistrictMetadataItem(state_code="UT", number=3), + CongressionalDistrictMetadataItem(state_code="UT", number=4), + + # Vermont - 1 at-large district + CongressionalDistrictMetadataItem(state_code="VT", number=1), + + # Virginia - 11 districts + CongressionalDistrictMetadataItem(state_code="VA", number=1), + CongressionalDistrictMetadataItem(state_code="VA", number=2), + CongressionalDistrictMetadataItem(state_code="VA", number=3), + CongressionalDistrictMetadataItem(state_code="VA", number=4), + CongressionalDistrictMetadataItem(state_code="VA", number=5), + CongressionalDistrictMetadataItem(state_code="VA", number=6), + CongressionalDistrictMetadataItem(state_code="VA", number=7), + CongressionalDistrictMetadataItem(state_code="VA", number=8), + CongressionalDistrictMetadataItem(state_code="VA", number=9), + CongressionalDistrictMetadataItem(state_code="VA", number=10), + CongressionalDistrictMetadataItem(state_code="VA", number=11), + + # Washington - 10 districts + CongressionalDistrictMetadataItem(state_code="WA", number=1), + CongressionalDistrictMetadataItem(state_code="WA", number=2), + CongressionalDistrictMetadataItem(state_code="WA", number=3), + CongressionalDistrictMetadataItem(state_code="WA", number=4), + CongressionalDistrictMetadataItem(state_code="WA", number=5), + CongressionalDistrictMetadataItem(state_code="WA", number=6), + CongressionalDistrictMetadataItem(state_code="WA", number=7), + CongressionalDistrictMetadataItem(state_code="WA", number=8), + CongressionalDistrictMetadataItem(state_code="WA", number=9), + CongressionalDistrictMetadataItem(state_code="WA", number=10), + + # West Virginia - 2 districts + CongressionalDistrictMetadataItem(state_code="WV", number=1), + CongressionalDistrictMetadataItem(state_code="WV", number=2), + + # Wisconsin - 8 districts + 
CongressionalDistrictMetadataItem(state_code="WI", number=1), + CongressionalDistrictMetadataItem(state_code="WI", number=2), + CongressionalDistrictMetadataItem(state_code="WI", number=3), + CongressionalDistrictMetadataItem(state_code="WI", number=4), + CongressionalDistrictMetadataItem(state_code="WI", number=5), + CongressionalDistrictMetadataItem(state_code="WI", number=6), + CongressionalDistrictMetadataItem(state_code="WI", number=7), + CongressionalDistrictMetadataItem(state_code="WI", number=8), + + # Wyoming - 1 at-large district + CongressionalDistrictMetadataItem(state_code="WY", number=1), + + # District of Columbia - 1 non-voting delegate + CongressionalDistrictMetadataItem(state_code="DC", number=1), +] From c25df3511d1546d259af4f7403e58bd8f174eef6 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Sun, 30 Nov 2025 21:35:30 +0200 Subject: [PATCH 02/21] feat: Add congressional district metadata to full metadata --- policyengine_api/country.py | 5 + .../data/congressional_districts.py | 152 +++++++++++++++++- 2 files changed, 154 insertions(+), 3 deletions(-) diff --git a/policyengine_api/country.py b/policyengine_api/country.py index af2d8ec4..e5c9b284 100644 --- a/policyengine_api/country.py +++ b/policyengine_api/country.py @@ -18,6 +18,9 @@ import math import pandas as pd from pathlib import Path +from policyengine_api.data.congressional_districts import ( + build_congressional_district_metadata, +) # Note: The following policyengine_[xx] imports are probably redundant. # These modules are imported dynamically in the __init__ function below. 
@@ -153,6 +156,8 @@ def build_microsimulation_options(self) -> dict: dict(name="wi", label="Wisconsin"), dict(name="wy", label="Wyoming"), ] + # Add all 436 congressional districts (435 voting + DC) + region.extend(build_congressional_district_metadata()) time_period = [ dict(name=2035, label="2035"), dict(name=2034, label="2034"), diff --git a/policyengine_api/data/congressional_districts.py b/policyengine_api/data/congressional_districts.py index af8070dc..3ab01c33 100644 --- a/policyengine_api/data/congressional_districts.py +++ b/policyengine_api/data/congressional_districts.py @@ -12,6 +12,62 @@ from pydantic import BaseModel, Field +# Mapping of state codes to full state names +STATE_CODE_TO_NAME = { + "AL": "Alabama", + "AK": "Alaska", + "AZ": "Arizona", + "AR": "Arkansas", + "CA": "California", + "CO": "Colorado", + "CT": "Connecticut", + "DE": "Delaware", + "DC": "District of Columbia", + "FL": "Florida", + "GA": "Georgia", + "HI": "Hawaii", + "ID": "Idaho", + "IL": "Illinois", + "IN": "Indiana", + "IA": "Iowa", + "KS": "Kansas", + "KY": "Kentucky", + "LA": "Louisiana", + "ME": "Maine", + "MD": "Maryland", + "MA": "Massachusetts", + "MI": "Michigan", + "MN": "Minnesota", + "MS": "Mississippi", + "MO": "Missouri", + "MT": "Montana", + "NE": "Nebraska", + "NV": "Nevada", + "NH": "New Hampshire", + "NJ": "New Jersey", + "NM": "New Mexico", + "NY": "New York", + "NC": "North Carolina", + "ND": "North Dakota", + "OH": "Ohio", + "OK": "Oklahoma", + "OR": "Oregon", + "PA": "Pennsylvania", + "RI": "Rhode Island", + "SC": "South Carolina", + "SD": "South Dakota", + "TN": "Tennessee", + "TX": "Texas", + "UT": "Utah", + "VT": "Vermont", + "VA": "Virginia", + "WA": "Washington", + "WV": "West Virginia", + "WI": "Wisconsin", + "WY": "Wyoming", +} + + class CongressionalDistrictMetadataItem(BaseModel): """ Metadata for a single US Congressional district. 
@@ -142,6 +198,9 @@ class CongressionalDistrictMetadataItem(BaseModel): # Delaware - 1 at-large district CongressionalDistrictMetadataItem(state_code="DE", number=1), + # District of Columbia - 1 non-voting delegate + CongressionalDistrictMetadataItem(state_code="DC", number=1), + # Florida - 28 districts CongressionalDistrictMetadataItem(state_code="FL", number=1), CongressionalDistrictMetadataItem(state_code="FL", number=2), @@ -573,7 +632,94 @@ class CongressionalDistrictMetadataItem(BaseModel): # Wyoming - 1 at-large district CongressionalDistrictMetadataItem(state_code="WY", number=1), - - # District of Columbia - 1 non-voting delegate - CongressionalDistrictMetadataItem(state_code="DC", number=1), ] + + +def _get_ordinal_suffix(number: int) -> str: + """ + Get the ordinal suffix for a number (st, nd, rd, th). + + Examples: + 1 -> "st" + 2 -> "nd" + 3 -> "rd" + 4 -> "th" + 11 -> "th" + 21 -> "st" + 22 -> "nd" + """ + if 10 <= number % 100 <= 20: + # Special case for 11th, 12th, 13th, etc. + suffix = "th" + else: + suffix = {1: "st", 2: "nd", 3: "rd"}.get(number % 10, "th") + return suffix + + +def _format_district_number(number: int) -> str: + """ + Format district number with leading zero for single digits. 
+ + Examples: + 1 -> "01" + 9 -> "09" + 10 -> "10" + 38 -> "38" + """ + return f"{number:02d}" + + +def _build_district_name(state_code: str, number: int) -> str: + """ + Build the district name in the format: congressional_district/<STATE_CODE>-<NUMBER> + + Examples: + ("CA", 5) -> "congressional_district/CA-05" + ("TX", 38) -> "congressional_district/TX-38" + ("DC", 1) -> "congressional_district/DC-01" + """ + return f"congressional_district/{state_code}-{_format_district_number(number)}" + + +def _build_district_label(state_code: str, number: int) -> str: + """ + Build the district label in the format: <State Name>'s <number>th congressional district + + Examples: + ("CA", 1) -> "California's 1st congressional district" + ("NY", 2) -> "New York's 2nd congressional district" + ("TX", 3) -> "Texas's 3rd congressional district" + ("FL", 21) -> "Florida's 21st congressional district" + """ + state_name = STATE_CODE_TO_NAME[state_code] + ordinal_suffix = _get_ordinal_suffix(number) + return f"{state_name}'s {number}{ordinal_suffix} congressional district" + + +def build_congressional_district_metadata() -> list[dict]: + """ + Build the complete congressional district metadata structure for use in country.py. + + Returns a list of dictionaries with 'name' and 'label' keys, formatted as: + [ + { + "name": "congressional_district/CA-01", + "label": "California's 1st congressional district" + }, + { + "name": "congressional_district/CA-02", + "label": "California's 2nd congressional district" + }, + ...
+ ] + + Returns: + List of 436 dictionaries (435 voting districts + DC) + """ + return [ + { + "name": _build_district_name(district.state_code, district.number), + "label": _build_district_label(district.state_code, district.number), + } + for district in CONGRESSIONAL_DISTRICTS + ] From 5039738665cb6d36b92202185285abaee3e2ddaf Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Mon, 8 Dec 2025 14:17:14 +0400 Subject: [PATCH 03/21] fix: Remove _setup_data function --- policyengine_api/services/economy_service.py | 26 +------- tests/unit/services/test_economy_service.py | 68 -------------------- 2 files changed, 1 insertion(+), 93 deletions(-) diff --git a/policyengine_api/services/economy_service.py b/policyengine_api/services/economy_service.py index ae4f24be..ef661b4d 100644 --- a/policyengine_api/services/economy_service.py +++ b/policyengine_api/services/economy_service.py @@ -394,7 +394,6 @@ def _handle_create_impact( region=setup_options.region, time_period=setup_options.time_period, scope="macro", - dataset=setup_options.dataset, include_cliffs=setup_options.target == "cliff", model_version=setup_options.model_version, data_version=setup_options.data_version, @@ -430,7 +429,6 @@ def _setup_sim_options( reform_policy: Annotated[str, "String-formatted JSON"], baseline_policy: Annotated[str, "String-formatted JSON"], region: str, - dataset: str | None, time_period: str, scope: Literal["macro", "household"] = "macro", include_cliffs: bool = False, @@ -452,9 +450,7 @@ def _setup_sim_options( "region": self._setup_region( country_id=country_id, region=region ), - "data": self._setup_data( - dataset=dataset, country_id=country_id, region=region - ), + "data": None, "model_version": model_version, "data_version": data_version, } @@ -471,26 +467,6 @@ def _setup_region(self, country_id: str, region: str) -> str: return region - def _setup_data( - self, dataset: str | None, country_id: str, region: str - ) -> str | None: - """ - Take API v1 'data' string literals, which 
reference a dataset name, - and convert to relevant GCP filepath. In future, this should be - redone to use a more robust method of accessing datasets. - """ - - # Enhanced CPS runs must reference ECPS dataset in Google Cloud bucket - if dataset == "enhanced_cps": - return "gs://policyengine-us-data/enhanced_cps_2024.h5" - - # NYC simulations must reference pooled CPS dataset - if region == "nyc": - return "gs://policyengine-us-data/pooled_3_year_cps_2023.h5" - - # All others (including US state-level simulations) receive no sim API 'data' arg - return None - # Note: The following methods that interface with the ReformImpactsService # are written separately because the service relies upon mutating an original # 'computing' record to 'ok' or 'error' status, rather than creating a new record. diff --git a/tests/unit/services/test_economy_service.py b/tests/unit/services/test_economy_service.py index 4f63672a..7d568b00 100644 --- a/tests/unit/services/test_economy_service.py +++ b/tests/unit/services/test_economy_service.py @@ -702,71 +702,3 @@ def test__given_non_us_state__returns_correct_region(self): result = service._setup_region(country_id, region) # Assert the expected value assert result == region - - class TestSetupData: - def test__given_enhanced_cps_dataset__returns_correct_gcp_path(self): - # Test with enhanced_cps dataset - dataset = "enhanced_cps" - country_id = "us" - region = "us" - - # Create an instance of the class - service = EconomyService() - # Call the method - result = service._setup_data(dataset, country_id, region) - # Assert the expected value - assert result == "gs://policyengine-us-data/enhanced_cps_2024.h5" - - def test__given_us_state_dataset__returns_none(self): - # Test with US state dataset - should return None - dataset = "us_state" - country_id = "us" - region = "ca" - - # Create an instance of the class - service = EconomyService() - # Call the method - result = service._setup_data(dataset, country_id, region) - # Assert the 
expected value - assert result is None - - def test__given_nyc_region__returns_pooled_cps(self): - # Test with NYC region - should return pooled CPS dataset - dataset = None - country_id = "us" - region = "nyc" - - # Create an instance of the class - service = EconomyService() - # Call the method - result = service._setup_data(dataset, country_id, region) - # Assert the expected value - assert ( - result == "gs://policyengine-us-data/pooled_3_year_cps_2023.h5" - ) - - def test__given_us_nationwide_dataset__returns_none(self): - # Test with US nationwide dataset - dataset = "us_nationwide" - country_id = "us" - region = "us" - - # Create an instance of the class - service = EconomyService() - # Call the method - result = service._setup_data(dataset, country_id, region) - # Assert the expected value - assert result is None - - def test__given_uk_dataset__returns_none(self): - # Test with UK dataset - dataset = "uk_dataset" - country_id = "uk" - region = "country/england" - - # Create an instance of the class - service = EconomyService() - # Call the method - result = service._setup_data(dataset, country_id, region) - # Assert the expected value - assert result is None From 8c77a78898a41b30d73f4cd5f1cb20a5a18b07dc Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Mon, 8 Dec 2025 15:24:19 +0400 Subject: [PATCH 04/21] fix: Return default data version always --- policyengine_api/data/model_setup.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/policyengine_api/data/model_setup.py b/policyengine_api/data/model_setup.py index b734008e..a2a6a3ee 100644 --- a/policyengine_api/data/model_setup.py +++ b/policyengine_api/data/model_setup.py @@ -20,9 +20,12 @@ def get_dataset_version(country_id: str) -> str | None: """ - Get the latest dataset version for the specified country. If the country exists, but - no version is found, return None. If PolicyEngine does not publish data for the country, - raise a ValueError. 
+ Get the dataset version for the specified country. If PolicyEngine does not + publish data for the country, raise a ValueError. + + By returning None for all valid countries, we allow policyengine.py to use + whatever default dataset version it has available, without imposing version + validation constraints from the API layer. """ match country_id: case "uk": From 59fce8d877f896980e73c794b421bc7855c5144d Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Mon, 8 Dec 2025 16:02:53 +0400 Subject: [PATCH 05/21] fix: Properly prefix districts --- policyengine_api/constants.py | 14 ++++++++++++++ policyengine_api/services/economy_service.py | 12 +++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/policyengine_api/constants.py b/policyengine_api/constants.py index 33f28466..eaaeb7a6 100644 --- a/policyengine_api/constants.py +++ b/policyengine_api/constants.py @@ -24,4 +24,18 @@ } except: COUNTRY_PACKAGE_VERSIONS = {country: "0.0.0" for country in COUNTRIES} + +# Valid region prefixes for each country +# These define the allowed geographic scope prefixes in region names +REGION_PREFIXES = { + "us": [ + "state/", # US states (e.g., "state/ca", "state/ny") + "congressional_district/", # US congressional districts (e.g., "congressional_district/CA-37") + ], + "uk": [ + "country/", # UK countries (e.g., "country/england", "country/scotland") + "constituency/", # UK parliamentary constituencies (e.g., "constituency/Aldershot") + ], +} + __version__ = VERSION diff --git a/policyengine_api/services/economy_service.py b/policyengine_api/services/economy_service.py index ef661b4d..b8336589 100644 --- a/policyengine_api/services/economy_service.py +++ b/policyengine_api/services/economy_service.py @@ -2,7 +2,7 @@ from policyengine_api.services.reform_impacts_service import ( ReformImpactsService, ) -from policyengine_api.constants import COUNTRY_PACKAGE_VERSIONS +from policyengine_api.constants import COUNTRY_PACKAGE_VERSIONS, REGION_PREFIXES from 
policyengine_api.gcp_logging import logger from policyengine_api.libs.simulation_api import SimulationAPI from policyengine_api.data.model_setup import get_dataset_version @@ -461,9 +461,15 @@ def _setup_region(self, country_id: str, region: str) -> str: Convert API v1 'region' option to API v2-compatible 'region' option. """ - # For US, states must be prefixed with 'state/' + # For US regions (excluding the national-level "us") if country_id == "us" and region != "us": - return "state/" + region + # Check if region already has a valid prefix + valid_prefixes = REGION_PREFIXES.get(country_id, []) + has_valid_prefix = any(region.startswith(prefix) for prefix in valid_prefixes) + + if not has_valid_prefix: + # Legacy format: bare region codes (e.g., "tx") need "state/" prefix + return "state/" + region return region From db69a02d6054c267999cd69634cb39e294349b29 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Tue, 9 Dec 2025 00:45:19 +0400 Subject: [PATCH 06/21] fix: Add region validation for US --- .../data/congressional_districts.py | 25 ++++++++++++++ policyengine_api/services/economy_service.py | 33 ++++++++++++++++++- 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/policyengine_api/data/congressional_districts.py b/policyengine_api/data/congressional_districts.py index 3ab01c33..73488833 100644 --- a/policyengine_api/data/congressional_districts.py +++ b/policyengine_api/data/congressional_districts.py @@ -723,3 +723,28 @@ def build_congressional_district_metadata() -> list[dict]: } for district in CONGRESSIONAL_DISTRICTS ] + + +def get_valid_state_codes() -> set[str]: + """ + Get the set of valid US state codes (lowercase for case-insensitive matching). + + Returns: + Set of 51 lowercase state codes (50 states + DC) + """ + return {code.lower() for code in STATE_CODE_TO_NAME.keys()} + + +def get_valid_congressional_districts() -> set[str]: + """ + Get the set of valid congressional district identifiers (lowercase for case-insensitive matching). 
+ + Format: "<state_code>-<district_number>" (e.g., "ca-37", "tx-01") + + Returns: + Set of 436 lowercase district identifiers + """ + return { + f"{district.state_code.lower()}-{_format_district_number(district.number)}" + for district in CONGRESSIONAL_DISTRICTS + } diff --git a/policyengine_api/services/economy_service.py b/policyengine_api/services/economy_service.py index b8336589..785ea982 100644 --- a/policyengine_api/services/economy_service.py +++ b/policyengine_api/services/economy_service.py @@ -6,6 +6,10 @@ from policyengine_api.gcp_logging import logger from policyengine_api.libs.simulation_api import SimulationAPI from policyengine_api.data.model_setup import get_dataset_version +from policyengine_api.data.congressional_districts import ( + get_valid_state_codes, + get_valid_congressional_districts, +) from policyengine.simulation import SimulationOptions from google.cloud.workflows import executions_v1 import json @@ -459,6 +463,9 @@ def _setup_sim_options( def _setup_region(self, country_id: str, region: str) -> str: """ Convert API v1 'region' option to API v2-compatible 'region' option. + + Validates that the region is a known valid region for the country. + Raises ValueError for invalid regions.
""" # For US regions (excluding the national-level "us") @@ -467,12 +474,36 @@ def _setup_region(self, country_id: str, region: str) -> str: valid_prefixes = REGION_PREFIXES.get(country_id, []) has_valid_prefix = any(region.startswith(prefix) for prefix in valid_prefixes) - if not has_valid_prefix: + if has_valid_prefix: + # Validate the region value after the prefix + self._validate_us_region(region) + return region + else: # Legacy format: bare region codes (e.g., "tx") need "state/" prefix + # Validate it's a real state code before adding prefix + if region.lower() not in get_valid_state_codes(): + raise ValueError(f"Invalid US region: '{region}'") return "state/" + region return region + def _validate_us_region(self, region: str) -> None: + """ + Validate a prefixed US region string. + + Raises ValueError if the region is not valid. + """ + if region.startswith("state/"): + state_code = region[len("state/"):] + if state_code.lower() not in get_valid_state_codes(): + raise ValueError(f"Invalid US state: '{state_code}'") + elif region.startswith("congressional_district/"): + district_id = region[len("congressional_district/"):] + if district_id.lower() not in get_valid_congressional_districts(): + raise ValueError( + f"Invalid congressional district: '{district_id}'" + ) + # Note: The following methods that interface with the ReformImpactsService # are written separately because the service relies upon mutating an original # 'computing' record to 'ok' or 'error' status, rather than creating a new record. 
From 8a5749b6593d12890f81810a0abdcdbeb68adfe3 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Tue, 9 Dec 2025 00:47:28 +0400 Subject: [PATCH 07/21] chore: Lint --- changelog_entry.yaml | 6 ++ policyengine_api/constants.py | 8 +-- .../data/congressional_districts.py | 58 ++----------------- policyengine_api/services/economy_service.py | 13 +++-- 4 files changed, 24 insertions(+), 61 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..7a5b8fac 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,6 @@ +- bump: patch + changes: + added: + - US congressional district metadata + changed: + - US simulations use default datasets from .py diff --git a/policyengine_api/constants.py b/policyengine_api/constants.py index eaaeb7a6..17b7e3fb 100644 --- a/policyengine_api/constants.py +++ b/policyengine_api/constants.py @@ -29,12 +29,12 @@ # These define the allowed geographic scope prefixes in region names REGION_PREFIXES = { "us": [ - "state/", # US states (e.g., "state/ca", "state/ny") - "congressional_district/", # US congressional districts (e.g., "congressional_district/CA-37") + "state/", # US states (e.g., "state/ca", "state/ny") + "congressional_district/", # US congressional districts (e.g., "congressional_district/CA-37") ], "uk": [ - "country/", # UK countries (e.g., "country/england", "country/scotland") - "constituency/", # UK parliamentary constituencies (e.g., "constituency/Aldershot") + "country/", # UK countries (e.g., "country/england", "country/scotland") + "constituency/", # UK parliamentary constituencies (e.g., "constituency/Aldershot") ], } diff --git a/policyengine_api/data/congressional_districts.py b/policyengine_api/data/congressional_districts.py index 73488833..af17fb31 100644 --- a/policyengine_api/data/congressional_districts.py +++ b/policyengine_api/data/congressional_districts.py @@ -84,12 +84,12 @@ class CongressionalDistrictMetadataItem(BaseModel): description="Two-letter US state code 
(uppercase)", min_length=2, max_length=2, - pattern="^[A-Z]{2}$" + pattern="^[A-Z]{2}$", ) number: int = Field( ..., description="Congressional district number (1 for at-large districts)", - ge=1 + ge=1, ) @@ -103,10 +103,8 @@ class CongressionalDistrictMetadataItem(BaseModel): CongressionalDistrictMetadataItem(state_code="AL", number=5), CongressionalDistrictMetadataItem(state_code="AL", number=6), CongressionalDistrictMetadataItem(state_code="AL", number=7), - # Alaska - 1 at-large district CongressionalDistrictMetadataItem(state_code="AK", number=1), - # Arizona - 9 districts CongressionalDistrictMetadataItem(state_code="AZ", number=1), CongressionalDistrictMetadataItem(state_code="AZ", number=2), @@ -117,13 +115,11 @@ class CongressionalDistrictMetadataItem(BaseModel): CongressionalDistrictMetadataItem(state_code="AZ", number=7), CongressionalDistrictMetadataItem(state_code="AZ", number=8), CongressionalDistrictMetadataItem(state_code="AZ", number=9), - # Arkansas - 4 districts CongressionalDistrictMetadataItem(state_code="AR", number=1), CongressionalDistrictMetadataItem(state_code="AR", number=2), CongressionalDistrictMetadataItem(state_code="AR", number=3), CongressionalDistrictMetadataItem(state_code="AR", number=4), - # California - 52 districts CongressionalDistrictMetadataItem(state_code="CA", number=1), CongressionalDistrictMetadataItem(state_code="CA", number=2), @@ -177,7 +173,6 @@ class CongressionalDistrictMetadataItem(BaseModel): CongressionalDistrictMetadataItem(state_code="CA", number=50), CongressionalDistrictMetadataItem(state_code="CA", number=51), CongressionalDistrictMetadataItem(state_code="CA", number=52), - # Colorado - 8 districts CongressionalDistrictMetadataItem(state_code="CO", number=1), CongressionalDistrictMetadataItem(state_code="CO", number=2), @@ -187,20 +182,16 @@ class CongressionalDistrictMetadataItem(BaseModel): CongressionalDistrictMetadataItem(state_code="CO", number=6), CongressionalDistrictMetadataItem(state_code="CO", 
number=7), CongressionalDistrictMetadataItem(state_code="CO", number=8), - # Connecticut - 5 districts CongressionalDistrictMetadataItem(state_code="CT", number=1), CongressionalDistrictMetadataItem(state_code="CT", number=2), CongressionalDistrictMetadataItem(state_code="CT", number=3), CongressionalDistrictMetadataItem(state_code="CT", number=4), CongressionalDistrictMetadataItem(state_code="CT", number=5), - # Delaware - 1 at-large district CongressionalDistrictMetadataItem(state_code="DE", number=1), - # District of Columbia - 1 non-voting delegate CongressionalDistrictMetadataItem(state_code="DC", number=1), - # Florida - 28 districts CongressionalDistrictMetadataItem(state_code="FL", number=1), CongressionalDistrictMetadataItem(state_code="FL", number=2), @@ -230,7 +221,6 @@ class CongressionalDistrictMetadataItem(BaseModel): CongressionalDistrictMetadataItem(state_code="FL", number=26), CongressionalDistrictMetadataItem(state_code="FL", number=27), CongressionalDistrictMetadataItem(state_code="FL", number=28), - # Georgia - 14 districts CongressionalDistrictMetadataItem(state_code="GA", number=1), CongressionalDistrictMetadataItem(state_code="GA", number=2), @@ -246,15 +236,12 @@ class CongressionalDistrictMetadataItem(BaseModel): CongressionalDistrictMetadataItem(state_code="GA", number=12), CongressionalDistrictMetadataItem(state_code="GA", number=13), CongressionalDistrictMetadataItem(state_code="GA", number=14), - # Hawaii - 2 districts CongressionalDistrictMetadataItem(state_code="HI", number=1), CongressionalDistrictMetadataItem(state_code="HI", number=2), - # Idaho - 2 districts CongressionalDistrictMetadataItem(state_code="ID", number=1), CongressionalDistrictMetadataItem(state_code="ID", number=2), - # Illinois - 17 districts CongressionalDistrictMetadataItem(state_code="IL", number=1), CongressionalDistrictMetadataItem(state_code="IL", number=2), @@ -273,7 +260,6 @@ class CongressionalDistrictMetadataItem(BaseModel): 
CongressionalDistrictMetadataItem(state_code="IL", number=15), CongressionalDistrictMetadataItem(state_code="IL", number=16), CongressionalDistrictMetadataItem(state_code="IL", number=17), - # Indiana - 9 districts CongressionalDistrictMetadataItem(state_code="IN", number=1), CongressionalDistrictMetadataItem(state_code="IN", number=2), @@ -284,19 +270,16 @@ class CongressionalDistrictMetadataItem(BaseModel): CongressionalDistrictMetadataItem(state_code="IN", number=7), CongressionalDistrictMetadataItem(state_code="IN", number=8), CongressionalDistrictMetadataItem(state_code="IN", number=9), - # Iowa - 4 districts CongressionalDistrictMetadataItem(state_code="IA", number=1), CongressionalDistrictMetadataItem(state_code="IA", number=2), CongressionalDistrictMetadataItem(state_code="IA", number=3), CongressionalDistrictMetadataItem(state_code="IA", number=4), - # Kansas - 4 districts CongressionalDistrictMetadataItem(state_code="KS", number=1), CongressionalDistrictMetadataItem(state_code="KS", number=2), CongressionalDistrictMetadataItem(state_code="KS", number=3), CongressionalDistrictMetadataItem(state_code="KS", number=4), - # Kentucky - 6 districts CongressionalDistrictMetadataItem(state_code="KY", number=1), CongressionalDistrictMetadataItem(state_code="KY", number=2), @@ -304,7 +287,6 @@ class CongressionalDistrictMetadataItem(BaseModel): CongressionalDistrictMetadataItem(state_code="KY", number=4), CongressionalDistrictMetadataItem(state_code="KY", number=5), CongressionalDistrictMetadataItem(state_code="KY", number=6), - # Louisiana - 6 districts CongressionalDistrictMetadataItem(state_code="LA", number=1), CongressionalDistrictMetadataItem(state_code="LA", number=2), @@ -312,11 +294,9 @@ class CongressionalDistrictMetadataItem(BaseModel): CongressionalDistrictMetadataItem(state_code="LA", number=4), CongressionalDistrictMetadataItem(state_code="LA", number=5), CongressionalDistrictMetadataItem(state_code="LA", number=6), - # Maine - 2 districts 
CongressionalDistrictMetadataItem(state_code="ME", number=1), CongressionalDistrictMetadataItem(state_code="ME", number=2), - # Maryland - 8 districts CongressionalDistrictMetadataItem(state_code="MD", number=1), CongressionalDistrictMetadataItem(state_code="MD", number=2), @@ -326,7 +306,6 @@ class CongressionalDistrictMetadataItem(BaseModel): CongressionalDistrictMetadataItem(state_code="MD", number=6), CongressionalDistrictMetadataItem(state_code="MD", number=7), CongressionalDistrictMetadataItem(state_code="MD", number=8), - # Massachusetts - 9 districts CongressionalDistrictMetadataItem(state_code="MA", number=1), CongressionalDistrictMetadataItem(state_code="MA", number=2), @@ -337,7 +316,6 @@ class CongressionalDistrictMetadataItem(BaseModel): CongressionalDistrictMetadataItem(state_code="MA", number=7), CongressionalDistrictMetadataItem(state_code="MA", number=8), CongressionalDistrictMetadataItem(state_code="MA", number=9), - # Michigan - 13 districts CongressionalDistrictMetadataItem(state_code="MI", number=1), CongressionalDistrictMetadataItem(state_code="MI", number=2), @@ -352,7 +330,6 @@ class CongressionalDistrictMetadataItem(BaseModel): CongressionalDistrictMetadataItem(state_code="MI", number=11), CongressionalDistrictMetadataItem(state_code="MI", number=12), CongressionalDistrictMetadataItem(state_code="MI", number=13), - # Minnesota - 8 districts CongressionalDistrictMetadataItem(state_code="MN", number=1), CongressionalDistrictMetadataItem(state_code="MN", number=2), @@ -362,13 +339,11 @@ class CongressionalDistrictMetadataItem(BaseModel): CongressionalDistrictMetadataItem(state_code="MN", number=6), CongressionalDistrictMetadataItem(state_code="MN", number=7), CongressionalDistrictMetadataItem(state_code="MN", number=8), - # Mississippi - 4 districts CongressionalDistrictMetadataItem(state_code="MS", number=1), CongressionalDistrictMetadataItem(state_code="MS", number=2), CongressionalDistrictMetadataItem(state_code="MS", number=3), 
CongressionalDistrictMetadataItem(state_code="MS", number=4), - # Missouri - 8 districts CongressionalDistrictMetadataItem(state_code="MO", number=1), CongressionalDistrictMetadataItem(state_code="MO", number=2), @@ -378,26 +353,21 @@ class CongressionalDistrictMetadataItem(BaseModel): CongressionalDistrictMetadataItem(state_code="MO", number=6), CongressionalDistrictMetadataItem(state_code="MO", number=7), CongressionalDistrictMetadataItem(state_code="MO", number=8), - # Montana - 2 districts CongressionalDistrictMetadataItem(state_code="MT", number=1), CongressionalDistrictMetadataItem(state_code="MT", number=2), - # Nebraska - 3 districts CongressionalDistrictMetadataItem(state_code="NE", number=1), CongressionalDistrictMetadataItem(state_code="NE", number=2), CongressionalDistrictMetadataItem(state_code="NE", number=3), - # Nevada - 4 districts CongressionalDistrictMetadataItem(state_code="NV", number=1), CongressionalDistrictMetadataItem(state_code="NV", number=2), CongressionalDistrictMetadataItem(state_code="NV", number=3), CongressionalDistrictMetadataItem(state_code="NV", number=4), - # New Hampshire - 2 districts CongressionalDistrictMetadataItem(state_code="NH", number=1), CongressionalDistrictMetadataItem(state_code="NH", number=2), - # New Jersey - 12 districts CongressionalDistrictMetadataItem(state_code="NJ", number=1), CongressionalDistrictMetadataItem(state_code="NJ", number=2), @@ -411,12 +381,10 @@ class CongressionalDistrictMetadataItem(BaseModel): CongressionalDistrictMetadataItem(state_code="NJ", number=10), CongressionalDistrictMetadataItem(state_code="NJ", number=11), CongressionalDistrictMetadataItem(state_code="NJ", number=12), - # New Mexico - 3 districts CongressionalDistrictMetadataItem(state_code="NM", number=1), CongressionalDistrictMetadataItem(state_code="NM", number=2), CongressionalDistrictMetadataItem(state_code="NM", number=3), - # New York - 26 districts CongressionalDistrictMetadataItem(state_code="NY", number=1), 
CongressionalDistrictMetadataItem(state_code="NY", number=2), @@ -444,7 +412,6 @@ class CongressionalDistrictMetadataItem(BaseModel): CongressionalDistrictMetadataItem(state_code="NY", number=24), CongressionalDistrictMetadataItem(state_code="NY", number=25), CongressionalDistrictMetadataItem(state_code="NY", number=26), - # North Carolina - 14 districts CongressionalDistrictMetadataItem(state_code="NC", number=1), CongressionalDistrictMetadataItem(state_code="NC", number=2), @@ -460,10 +427,8 @@ class CongressionalDistrictMetadataItem(BaseModel): CongressionalDistrictMetadataItem(state_code="NC", number=12), CongressionalDistrictMetadataItem(state_code="NC", number=13), CongressionalDistrictMetadataItem(state_code="NC", number=14), - # North Dakota - 1 at-large district CongressionalDistrictMetadataItem(state_code="ND", number=1), - # Ohio - 15 districts CongressionalDistrictMetadataItem(state_code="OH", number=1), CongressionalDistrictMetadataItem(state_code="OH", number=2), @@ -480,14 +445,12 @@ class CongressionalDistrictMetadataItem(BaseModel): CongressionalDistrictMetadataItem(state_code="OH", number=13), CongressionalDistrictMetadataItem(state_code="OH", number=14), CongressionalDistrictMetadataItem(state_code="OH", number=15), - # Oklahoma - 5 districts CongressionalDistrictMetadataItem(state_code="OK", number=1), CongressionalDistrictMetadataItem(state_code="OK", number=2), CongressionalDistrictMetadataItem(state_code="OK", number=3), CongressionalDistrictMetadataItem(state_code="OK", number=4), CongressionalDistrictMetadataItem(state_code="OK", number=5), - # Oregon - 6 districts CongressionalDistrictMetadataItem(state_code="OR", number=1), CongressionalDistrictMetadataItem(state_code="OR", number=2), @@ -495,7 +458,6 @@ class CongressionalDistrictMetadataItem(BaseModel): CongressionalDistrictMetadataItem(state_code="OR", number=4), CongressionalDistrictMetadataItem(state_code="OR", number=5), CongressionalDistrictMetadataItem(state_code="OR", number=6), 
- # Pennsylvania - 17 districts CongressionalDistrictMetadataItem(state_code="PA", number=1), CongressionalDistrictMetadataItem(state_code="PA", number=2), @@ -514,11 +476,9 @@ class CongressionalDistrictMetadataItem(BaseModel): CongressionalDistrictMetadataItem(state_code="PA", number=15), CongressionalDistrictMetadataItem(state_code="PA", number=16), CongressionalDistrictMetadataItem(state_code="PA", number=17), - # Rhode Island - 2 districts CongressionalDistrictMetadataItem(state_code="RI", number=1), CongressionalDistrictMetadataItem(state_code="RI", number=2), - # South Carolina - 7 districts CongressionalDistrictMetadataItem(state_code="SC", number=1), CongressionalDistrictMetadataItem(state_code="SC", number=2), @@ -527,10 +487,8 @@ class CongressionalDistrictMetadataItem(BaseModel): CongressionalDistrictMetadataItem(state_code="SC", number=5), CongressionalDistrictMetadataItem(state_code="SC", number=6), CongressionalDistrictMetadataItem(state_code="SC", number=7), - # South Dakota - 1 at-large district CongressionalDistrictMetadataItem(state_code="SD", number=1), - # Tennessee - 9 districts CongressionalDistrictMetadataItem(state_code="TN", number=1), CongressionalDistrictMetadataItem(state_code="TN", number=2), @@ -541,7 +499,6 @@ class CongressionalDistrictMetadataItem(BaseModel): CongressionalDistrictMetadataItem(state_code="TN", number=7), CongressionalDistrictMetadataItem(state_code="TN", number=8), CongressionalDistrictMetadataItem(state_code="TN", number=9), - # Texas - 38 districts CongressionalDistrictMetadataItem(state_code="TX", number=1), CongressionalDistrictMetadataItem(state_code="TX", number=2), @@ -581,16 +538,13 @@ class CongressionalDistrictMetadataItem(BaseModel): CongressionalDistrictMetadataItem(state_code="TX", number=36), CongressionalDistrictMetadataItem(state_code="TX", number=37), CongressionalDistrictMetadataItem(state_code="TX", number=38), - # Utah - 4 districts CongressionalDistrictMetadataItem(state_code="UT", number=1), 
CongressionalDistrictMetadataItem(state_code="UT", number=2), CongressionalDistrictMetadataItem(state_code="UT", number=3), CongressionalDistrictMetadataItem(state_code="UT", number=4), - # Vermont - 1 at-large district CongressionalDistrictMetadataItem(state_code="VT", number=1), - # Virginia - 11 districts CongressionalDistrictMetadataItem(state_code="VA", number=1), CongressionalDistrictMetadataItem(state_code="VA", number=2), @@ -603,7 +557,6 @@ class CongressionalDistrictMetadataItem(BaseModel): CongressionalDistrictMetadataItem(state_code="VA", number=9), CongressionalDistrictMetadataItem(state_code="VA", number=10), CongressionalDistrictMetadataItem(state_code="VA", number=11), - # Washington - 10 districts CongressionalDistrictMetadataItem(state_code="WA", number=1), CongressionalDistrictMetadataItem(state_code="WA", number=2), @@ -615,11 +568,9 @@ class CongressionalDistrictMetadataItem(BaseModel): CongressionalDistrictMetadataItem(state_code="WA", number=8), CongressionalDistrictMetadataItem(state_code="WA", number=9), CongressionalDistrictMetadataItem(state_code="WA", number=10), - # West Virginia - 2 districts CongressionalDistrictMetadataItem(state_code="WV", number=1), CongressionalDistrictMetadataItem(state_code="WV", number=2), - # Wisconsin - 8 districts CongressionalDistrictMetadataItem(state_code="WI", number=1), CongressionalDistrictMetadataItem(state_code="WI", number=2), @@ -629,7 +580,6 @@ class CongressionalDistrictMetadataItem(BaseModel): CongressionalDistrictMetadataItem(state_code="WI", number=6), CongressionalDistrictMetadataItem(state_code="WI", number=7), CongressionalDistrictMetadataItem(state_code="WI", number=8), - # Wyoming - 1 at-large district CongressionalDistrictMetadataItem(state_code="WY", number=1), ] @@ -719,7 +669,9 @@ def build_congressional_district_metadata() -> list[dict]: return [ { "name": _build_district_name(district.state_code, district.number), - "label": _build_district_label(district.state_code, 
district.number), + "label": _build_district_label( + district.state_code, district.number + ), } for district in CONGRESSIONAL_DISTRICTS ] diff --git a/policyengine_api/services/economy_service.py b/policyengine_api/services/economy_service.py index 785ea982..af8f5f0f 100644 --- a/policyengine_api/services/economy_service.py +++ b/policyengine_api/services/economy_service.py @@ -2,7 +2,10 @@ from policyengine_api.services.reform_impacts_service import ( ReformImpactsService, ) -from policyengine_api.constants import COUNTRY_PACKAGE_VERSIONS, REGION_PREFIXES +from policyengine_api.constants import ( + COUNTRY_PACKAGE_VERSIONS, + REGION_PREFIXES, +) from policyengine_api.gcp_logging import logger from policyengine_api.libs.simulation_api import SimulationAPI from policyengine_api.data.model_setup import get_dataset_version @@ -472,7 +475,9 @@ def _setup_region(self, country_id: str, region: str) -> str: if country_id == "us" and region != "us": # Check if region already has a valid prefix valid_prefixes = REGION_PREFIXES.get(country_id, []) - has_valid_prefix = any(region.startswith(prefix) for prefix in valid_prefixes) + has_valid_prefix = any( + region.startswith(prefix) for prefix in valid_prefixes + ) if has_valid_prefix: # Validate the region value after the prefix @@ -494,11 +499,11 @@ def _validate_us_region(self, region: str) -> None: Raises ValueError if the region is not valid. 
""" if region.startswith("state/"): - state_code = region[len("state/"):] + state_code = region[len("state/") :] if state_code.lower() not in get_valid_state_codes(): raise ValueError(f"Invalid US state: '{state_code}'") elif region.startswith("congressional_district/"): - district_id = region[len("congressional_district/"):] + district_id = region[len("congressional_district/") :] if district_id.lower() not in get_valid_congressional_districts(): raise ValueError( f"Invalid congressional district: '{district_id}'" From 426476ca480eed4c98ce987158b6cb40a81271a7 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Tue, 9 Dec 2025 12:47:07 +0400 Subject: [PATCH 08/21] test: Update tests to remove unused dataset param --- tests/unit/services/test_economy_service.py | 28 ++++++--------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/tests/unit/services/test_economy_service.py b/tests/unit/services/test_economy_service.py index 7d568b00..45f5bf79 100644 --- a/tests/unit/services/test_economy_service.py +++ b/tests/unit/services/test_economy_service.py @@ -528,7 +528,6 @@ class TestSetupSimOptions: ) test_current_law_baseline_policy = json.dumps({}) test_region = "us" - test_dataset = None test_time_period = 2025 test_scope: Literal["macro"] = "macro" @@ -537,13 +536,12 @@ def test__given_valid_options__returns_correct_sim_options(self): # Create an instance of the class service = EconomyService() - # Call the method with the test data; patch setup_region and setup_data methods + # Call the method with the test data sim_options_model = service._setup_sim_options( self.test_country_id, self.test_reform_policy, self.test_current_law_baseline_policy, self.test_region, - self.test_dataset, self.test_time_period, self.test_scope, ) @@ -559,7 +557,7 @@ def test__given_valid_options__returns_correct_sim_options(self): ) assert sim_options["time_period"] == self.test_time_period assert sim_options["region"] == "us" - assert sim_options["data"] == None + assert 
sim_options["data"] is None def test__given_us_state__returns_correct_sim_options(self): # Test with a US state @@ -569,7 +567,6 @@ def test__given_us_state__returns_correct_sim_options(self): ) current_law_baseline_policy = json.dumps({}) region = "ca" - dataset = None time_period = 2025 scope = "macro" @@ -581,7 +578,6 @@ def test__given_us_state__returns_correct_sim_options(self): reform_policy, current_law_baseline_policy, region, - dataset, time_period, scope, ) @@ -597,15 +593,14 @@ def test__given_us_state__returns_correct_sim_options(self): assert sim_options["region"] == "state/ca" assert sim_options["data"] is None - def test__given_enhanced_cps_state__returns_correct_sim_options(self): - # Test with enhanced_cps dataset + def test__given_us_state_utah__returns_correct_sim_options(self): + # Test with Utah state country_id = "us" reform_policy = json.dumps( {"sample_param": {"2024-01-01.2100-12-31": 15}} ) current_law_baseline_policy = json.dumps({}) region = "ut" - dataset = "enhanced_cps" time_period = 2025 scope = "macro" @@ -617,7 +612,6 @@ def test__given_enhanced_cps_state__returns_correct_sim_options(self): reform_policy, current_law_baseline_policy, region, - dataset, time_period, scope, ) @@ -631,10 +625,7 @@ def test__given_enhanced_cps_state__returns_correct_sim_options(self): ) assert sim_options["time_period"] == time_period assert sim_options["region"] == "state/ut" - assert ( - sim_options["data"] - == "gs://policyengine-us-data/enhanced_cps_2024.h5" - ) + assert sim_options["data"] is None def test__given_cliff_target__returns_correct_sim_options(self): country_id = "us" @@ -643,10 +634,8 @@ def test__given_cliff_target__returns_correct_sim_options(self): ) current_law_baseline_policy = json.dumps({}) region = "us" - dataset = None time_period = 2025 scope = "macro" - target = "cliff" # Create an instance of the class service = EconomyService() @@ -657,10 +646,9 @@ def test__given_cliff_target__returns_correct_sim_options(self): 
reform_policy, current_law_baseline_policy, region, - dataset, time_period, scope, - include_cliffs=target == "cliff", + include_cliffs=True, ) # Assert the expected values in the returned dictionary @@ -673,8 +661,8 @@ def test__given_cliff_target__returns_correct_sim_options(self): ) assert sim_options["time_period"] == time_period assert sim_options["region"] == region - assert sim_options["data"] == None - assert sim_options["include_cliffs"] == True + assert sim_options["data"] is None + assert sim_options["include_cliffs"] is True class TestSetupRegion: def test__given_us_state__returns_correct_region(self): From cabb5d997bbc7cf04b1ecb7b6c19716d54a1fb0a Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Tue, 9 Dec 2025 12:57:23 +0400 Subject: [PATCH 09/21] test: Add tests --- tests/unit/data/__init__.py | 0 .../unit/data/test_congressional_districts.py | 245 ++++++++++++++++++ tests/unit/data/test_model_setup.py | 31 +++ tests/unit/services/test_economy_service.py | 90 +++++++ 4 files changed, 366 insertions(+) create mode 100644 tests/unit/data/__init__.py create mode 100644 tests/unit/data/test_congressional_districts.py create mode 100644 tests/unit/data/test_model_setup.py diff --git a/tests/unit/data/__init__.py b/tests/unit/data/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/data/test_congressional_districts.py b/tests/unit/data/test_congressional_districts.py new file mode 100644 index 00000000..02fb1e13 --- /dev/null +++ b/tests/unit/data/test_congressional_districts.py @@ -0,0 +1,245 @@ +import pytest +from pydantic import ValidationError + +from policyengine_api.data.congressional_districts import ( + CongressionalDistrictMetadataItem, + CONGRESSIONAL_DISTRICTS, + STATE_CODE_TO_NAME, + build_congressional_district_metadata, + get_valid_state_codes, + get_valid_congressional_districts, +) + + +class TestCongressionalDistrictMetadataItem: + """Tests for the CongressionalDistrictMetadataItem Pydantic model.""" + + def 
test__given_valid_state_code_and_number__creates_item(self): + item = CongressionalDistrictMetadataItem(state_code="CA", number=37) + assert item.state_code == "CA" + assert item.number == 37 + + def test__given_lowercase_state_code__raises_validation_error(self): + with pytest.raises(ValidationError): + CongressionalDistrictMetadataItem(state_code="ca", number=1) + + def test__given_single_letter_state_code__raises_validation_error(self): + with pytest.raises(ValidationError): + CongressionalDistrictMetadataItem(state_code="C", number=1) + + def test__given_three_letter_state_code__raises_validation_error(self): + with pytest.raises(ValidationError): + CongressionalDistrictMetadataItem(state_code="CAL", number=1) + + def test__given_zero_district_number__raises_validation_error(self): + with pytest.raises(ValidationError): + CongressionalDistrictMetadataItem(state_code="CA", number=0) + + def test__given_negative_district_number__raises_validation_error(self): + with pytest.raises(ValidationError): + CongressionalDistrictMetadataItem(state_code="CA", number=-1) + + +class TestStateCodeToName: + """Tests for the STATE_CODE_TO_NAME mapping.""" + + def test__contains_50_states_plus_dc(self): + assert len(STATE_CODE_TO_NAME) == 51 + + def test__contains_all_state_codes_uppercase(self): + for code in STATE_CODE_TO_NAME.keys(): + assert code == code.upper() + assert len(code) == 2 + + def test__contains_dc(self): + assert "DC" in STATE_CODE_TO_NAME + assert STATE_CODE_TO_NAME["DC"] == "District of Columbia" + + def test__contains_california(self): + assert "CA" in STATE_CODE_TO_NAME + assert STATE_CODE_TO_NAME["CA"] == "California" + + +class TestCongressionalDistricts: + """Tests for the CONGRESSIONAL_DISTRICTS list.""" + + def test__contains_436_districts(self): + # 435 voting districts + 1 DC non-voting + assert len(CONGRESSIONAL_DISTRICTS) == 436 + + def test__all_items_are_valid_metadata_items(self): + for district in CONGRESSIONAL_DISTRICTS: + assert 
isinstance(district, CongressionalDistrictMetadataItem) + + def test__all_state_codes_are_in_state_code_to_name(self): + for district in CONGRESSIONAL_DISTRICTS: + assert district.state_code in STATE_CODE_TO_NAME + + def test__california_has_52_districts(self): + ca_districts = [d for d in CONGRESSIONAL_DISTRICTS if d.state_code == "CA"] + assert len(ca_districts) == 52 + + def test__texas_has_38_districts(self): + tx_districts = [d for d in CONGRESSIONAL_DISTRICTS if d.state_code == "TX"] + assert len(tx_districts) == 38 + + def test__at_large_states_have_1_district(self): + # States with only 1 at-large representative + at_large_states = ["AK", "DE", "ND", "SD", "VT", "WY"] + for state_code in at_large_states: + state_districts = [ + d for d in CONGRESSIONAL_DISTRICTS if d.state_code == state_code + ] + assert len(state_districts) == 1 + assert state_districts[0].number == 1 + + def test__dc_has_1_district(self): + dc_districts = [d for d in CONGRESSIONAL_DISTRICTS if d.state_code == "DC"] + assert len(dc_districts) == 1 + assert dc_districts[0].number == 1 + + def test__dc_comes_after_delaware(self): + # Find indices + de_indices = [ + i for i, d in enumerate(CONGRESSIONAL_DISTRICTS) if d.state_code == "DE" + ] + dc_indices = [ + i for i, d in enumerate(CONGRESSIONAL_DISTRICTS) if d.state_code == "DC" + ] + # DC should come after all DE districts + assert min(dc_indices) > max(de_indices) + + +class TestBuildCongressionalDistrictMetadata: + """Tests for the build_congressional_district_metadata function.""" + + def test__returns_list_of_436_items(self): + metadata = build_congressional_district_metadata() + assert len(metadata) == 436 + + def test__each_item_has_name_and_label_keys(self): + metadata = build_congressional_district_metadata() + for item in metadata: + assert "name" in item + assert "label" in item + + def test__name_has_correct_format(self): + metadata = build_congressional_district_metadata() + # Check first California district + ca_01 = next( + 
item for item in metadata if item["name"] == "congressional_district/CA-01" + ) + assert ca_01 is not None + + def test__label_has_correct_format(self): + metadata = build_congressional_district_metadata() + ca_01 = next( + item for item in metadata if item["name"] == "congressional_district/CA-01" + ) + assert ca_01["label"] == "California's 1st congressional district" + + def test__ordinal_suffixes_are_correct(self): + metadata = build_congressional_district_metadata() + + # Find specific districts to test ordinal suffixes + ca_01 = next( + item for item in metadata if item["name"] == "congressional_district/CA-01" + ) + ca_02 = next( + item for item in metadata if item["name"] == "congressional_district/CA-02" + ) + ca_03 = next( + item for item in metadata if item["name"] == "congressional_district/CA-03" + ) + ca_11 = next( + item for item in metadata if item["name"] == "congressional_district/CA-11" + ) + ca_12 = next( + item for item in metadata if item["name"] == "congressional_district/CA-12" + ) + ca_21 = next( + item for item in metadata if item["name"] == "congressional_district/CA-21" + ) + ca_22 = next( + item for item in metadata if item["name"] == "congressional_district/CA-22" + ) + + assert "1st" in ca_01["label"] + assert "2nd" in ca_02["label"] + assert "3rd" in ca_03["label"] + assert "11th" in ca_11["label"] + assert "12th" in ca_12["label"] + assert "21st" in ca_21["label"] + assert "22nd" in ca_22["label"] + + def test__district_numbers_have_leading_zeros(self): + metadata = build_congressional_district_metadata() + # Single digit districts should have leading zero + ca_01 = next( + item for item in metadata if item["name"] == "congressional_district/CA-01" + ) + assert ca_01["name"] == "congressional_district/CA-01" + + # Double digit districts should not have leading zero + ca_37 = next( + item for item in metadata if item["name"] == "congressional_district/CA-37" + ) + assert ca_37["name"] == "congressional_district/CA-37" + + +class 
TestGetValidStateCodes: + """Tests for the get_valid_state_codes function.""" + + def test__returns_set_of_51_codes(self): + codes = get_valid_state_codes() + assert len(codes) == 51 + + def test__all_codes_are_lowercase(self): + codes = get_valid_state_codes() + for code in codes: + assert code == code.lower() + + def test__contains_california(self): + codes = get_valid_state_codes() + assert "ca" in codes + + def test__contains_dc(self): + codes = get_valid_state_codes() + assert "dc" in codes + + def test__does_not_contain_invalid_codes(self): + codes = get_valid_state_codes() + assert "xx" not in codes + assert "mb" not in codes # Manitoba (Canadian province) + + +class TestGetValidCongressionalDistricts: + """Tests for the get_valid_congressional_districts function.""" + + def test__returns_set_of_436_districts(self): + districts = get_valid_congressional_districts() + assert len(districts) == 436 + + def test__all_districts_are_lowercase(self): + districts = get_valid_congressional_districts() + for district in districts: + assert district == district.lower() + + def test__contains_california_37(self): + districts = get_valid_congressional_districts() + assert "ca-37" in districts + + def test__contains_dc_01(self): + districts = get_valid_congressional_districts() + assert "dc-01" in districts + + def test__single_digit_districts_have_leading_zero(self): + districts = get_valid_congressional_districts() + assert "ca-01" in districts + assert "ca-1" not in districts + + def test__does_not_contain_invalid_districts(self): + districts = get_valid_congressional_districts() + assert "ca-99" not in districts + assert "xx-01" not in districts + assert "cruft" not in districts diff --git a/tests/unit/data/test_model_setup.py b/tests/unit/data/test_model_setup.py new file mode 100644 index 00000000..45c69a11 --- /dev/null +++ b/tests/unit/data/test_model_setup.py @@ -0,0 +1,31 @@ +import pytest + +from policyengine_api.data.model_setup import get_dataset_version + + 
+class TestGetDatasetVersion: + """Tests for the get_dataset_version function.""" + + def test__given_us__returns_none(self): + result = get_dataset_version("us") + assert result is None + + def test__given_uk__returns_none(self): + result = get_dataset_version("uk") + assert result is None + + def test__given_invalid_country__raises_value_error(self): + with pytest.raises(ValueError) as exc_info: + get_dataset_version("invalid") + assert "Unknown country ID: invalid" in str(exc_info.value) + + def test__given_empty_string__raises_value_error(self): + with pytest.raises(ValueError) as exc_info: + get_dataset_version("") + assert "Unknown country ID:" in str(exc_info.value) + + def test__given_canada__raises_value_error(self): + # Canada is a valid country in the API but doesn't have dataset versioning + with pytest.raises(ValueError) as exc_info: + get_dataset_version("ca") + assert "Unknown country ID: ca" in str(exc_info.value) diff --git a/tests/unit/services/test_economy_service.py b/tests/unit/services/test_economy_service.py index 45f5bf79..779ec3e6 100644 --- a/tests/unit/services/test_economy_service.py +++ b/tests/unit/services/test_economy_service.py @@ -690,3 +690,93 @@ def test__given_non_us_state__returns_correct_region(self): result = service._setup_region(country_id, region) # Assert the expected value assert result == region + + def test__given_us_national__returns_us(self): + service = EconomyService() + result = service._setup_region("us", "us") + assert result == "us" + + def test__given_prefixed_state__returns_unchanged(self): + service = EconomyService() + result = service._setup_region("us", "state/tx") + assert result == "state/tx" + + def test__given_congressional_district__returns_unchanged(self): + service = EconomyService() + result = service._setup_region("us", "congressional_district/CA-37") + assert result == "congressional_district/CA-37" + + def test__given_lowercase_congressional_district__returns_unchanged(self): + service = 
EconomyService() + result = service._setup_region("us", "congressional_district/ca-37") + assert result == "congressional_district/ca-37" + + def test__given_invalid_state__raises_value_error(self): + service = EconomyService() + with pytest.raises(ValueError) as exc_info: + service._setup_region("us", "mb") # Manitoba is Canadian + assert "Invalid US region: 'mb'" in str(exc_info.value) + + def test__given_invalid_prefixed_state__raises_value_error(self): + service = EconomyService() + with pytest.raises(ValueError) as exc_info: + service._setup_region("us", "state/mb") + assert "Invalid US state: 'mb'" in str(exc_info.value) + + def test__given_invalid_congressional_district__raises_value_error(self): + service = EconomyService() + with pytest.raises(ValueError) as exc_info: + service._setup_region("us", "congressional_district/cruft") + assert "Invalid congressional district: 'cruft'" in str(exc_info.value) + + def test__given_invalid_prefix__raises_value_error(self): + service = EconomyService() + with pytest.raises(ValueError) as exc_info: + service._setup_region("us", "invalid_prefix/tx") + assert "Invalid US region: 'invalid_prefix/tx'" in str(exc_info.value) + + def test__given_invalid_bare_value__raises_value_error(self): + service = EconomyService() + with pytest.raises(ValueError) as exc_info: + service._setup_region("us", "invalid_value") + assert "Invalid US region: 'invalid_value'" in str(exc_info.value) + + + class TestValidateUsRegion: + """Tests for the _validate_us_region method.""" + + def test__given_valid_state__does_not_raise(self): + service = EconomyService() + # Should not raise + service._validate_us_region("state/ca") + + def test__given_valid_state_uppercase__does_not_raise(self): + service = EconomyService() + # Case-insensitive validation + service._validate_us_region("state/CA") + + def test__given_invalid_state__raises_value_error(self): + service = EconomyService() + with pytest.raises(ValueError) as exc_info: + 
service._validate_us_region("state/mb") + assert "Invalid US state: 'mb'" in str(exc_info.value) + + def test__given_valid_congressional_district__does_not_raise(self): + service = EconomyService() + service._validate_us_region("congressional_district/CA-37") + + def test__given_valid_congressional_district_lowercase__does_not_raise(self): + service = EconomyService() + service._validate_us_region("congressional_district/ca-37") + + def test__given_invalid_congressional_district__raises_value_error(self): + service = EconomyService() + with pytest.raises(ValueError) as exc_info: + service._validate_us_region("congressional_district/CA-99") + assert "Invalid congressional district: 'CA-99'" in str(exc_info.value) + + def test__given_nonexistent_district__raises_value_error(self): + service = EconomyService() + with pytest.raises(ValueError) as exc_info: + service._validate_us_region("congressional_district/cruft") + assert "Invalid congressional district: 'cruft'" in str(exc_info.value) From 3f6ce11ce1d7b3d47efd502f4982378019e3a9a1 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Tue, 9 Dec 2025 15:56:31 +0400 Subject: [PATCH 10/21] feat: Upgrade to using .py v0.7.0 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 239c2c67..c1232ce4 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ "policyengine_uk==2.39.0", "policyengine_us==1.457.0", "policyengine_core>=3.16.6", - "policyengine>=0.3.0", + "policyengine>=0.7.0", "pydantic", "pymysql", "python-dotenv", From 879148d9c7bb2568ac6740120de0423ade35e79a Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Tue, 9 Dec 2025 15:57:54 +0400 Subject: [PATCH 11/21] chore: Lint and changelog --- changelog_entry.yaml | 1 + .../unit/data/test_congressional_districts.py | 68 ++++++++++++++----- tests/unit/services/test_economy_service.py | 41 ++++++++--- 3 files changed, 82 insertions(+), 28 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 
7a5b8fac..c83a0bbf 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -4,3 +4,4 @@ - US congressional district metadata changed: - US simulations use default datasets from .py + - Upgraded .py version to 0.7.0 \ No newline at end of file diff --git a/tests/unit/data/test_congressional_districts.py b/tests/unit/data/test_congressional_districts.py index 02fb1e13..2d8f6fce 100644 --- a/tests/unit/data/test_congressional_districts.py +++ b/tests/unit/data/test_congressional_districts.py @@ -76,11 +76,15 @@ def test__all_state_codes_are_in_state_code_to_name(self): assert district.state_code in STATE_CODE_TO_NAME def test__california_has_52_districts(self): - ca_districts = [d for d in CONGRESSIONAL_DISTRICTS if d.state_code == "CA"] + ca_districts = [ + d for d in CONGRESSIONAL_DISTRICTS if d.state_code == "CA" + ] assert len(ca_districts) == 52 def test__texas_has_38_districts(self): - tx_districts = [d for d in CONGRESSIONAL_DISTRICTS if d.state_code == "TX"] + tx_districts = [ + d for d in CONGRESSIONAL_DISTRICTS if d.state_code == "TX" + ] assert len(tx_districts) == 38 def test__at_large_states_have_1_district(self): @@ -88,23 +92,31 @@ def test__at_large_states_have_1_district(self): at_large_states = ["AK", "DE", "ND", "SD", "VT", "WY"] for state_code in at_large_states: state_districts = [ - d for d in CONGRESSIONAL_DISTRICTS if d.state_code == state_code + d + for d in CONGRESSIONAL_DISTRICTS + if d.state_code == state_code ] assert len(state_districts) == 1 assert state_districts[0].number == 1 def test__dc_has_1_district(self): - dc_districts = [d for d in CONGRESSIONAL_DISTRICTS if d.state_code == "DC"] + dc_districts = [ + d for d in CONGRESSIONAL_DISTRICTS if d.state_code == "DC" + ] assert len(dc_districts) == 1 assert dc_districts[0].number == 1 def test__dc_comes_after_delaware(self): # Find indices de_indices = [ - i for i, d in enumerate(CONGRESSIONAL_DISTRICTS) if d.state_code == "DE" + i + for i, d in 
enumerate(CONGRESSIONAL_DISTRICTS) + if d.state_code == "DE" ] dc_indices = [ - i for i, d in enumerate(CONGRESSIONAL_DISTRICTS) if d.state_code == "DC" + i + for i, d in enumerate(CONGRESSIONAL_DISTRICTS) + if d.state_code == "DC" ] # DC should come after all DE districts assert min(dc_indices) > max(de_indices) @@ -127,14 +139,18 @@ def test__name_has_correct_format(self): metadata = build_congressional_district_metadata() # Check first California district ca_01 = next( - item for item in metadata if item["name"] == "congressional_district/CA-01" + item + for item in metadata + if item["name"] == "congressional_district/CA-01" ) assert ca_01 is not None def test__label_has_correct_format(self): metadata = build_congressional_district_metadata() ca_01 = next( - item for item in metadata if item["name"] == "congressional_district/CA-01" + item + for item in metadata + if item["name"] == "congressional_district/CA-01" ) assert ca_01["label"] == "California's 1st congressional district" @@ -143,25 +159,39 @@ def test__ordinal_suffixes_are_correct(self): # Find specific districts to test ordinal suffixes ca_01 = next( - item for item in metadata if item["name"] == "congressional_district/CA-01" + item + for item in metadata + if item["name"] == "congressional_district/CA-01" ) ca_02 = next( - item for item in metadata if item["name"] == "congressional_district/CA-02" + item + for item in metadata + if item["name"] == "congressional_district/CA-02" ) ca_03 = next( - item for item in metadata if item["name"] == "congressional_district/CA-03" + item + for item in metadata + if item["name"] == "congressional_district/CA-03" ) ca_11 = next( - item for item in metadata if item["name"] == "congressional_district/CA-11" + item + for item in metadata + if item["name"] == "congressional_district/CA-11" ) ca_12 = next( - item for item in metadata if item["name"] == "congressional_district/CA-12" + item + for item in metadata + if item["name"] == "congressional_district/CA-12" ) 
ca_21 = next( - item for item in metadata if item["name"] == "congressional_district/CA-21" + item + for item in metadata + if item["name"] == "congressional_district/CA-21" ) ca_22 = next( - item for item in metadata if item["name"] == "congressional_district/CA-22" + item + for item in metadata + if item["name"] == "congressional_district/CA-22" ) assert "1st" in ca_01["label"] @@ -176,13 +206,17 @@ def test__district_numbers_have_leading_zeros(self): metadata = build_congressional_district_metadata() # Single digit districts should have leading zero ca_01 = next( - item for item in metadata if item["name"] == "congressional_district/CA-01" + item + for item in metadata + if item["name"] == "congressional_district/CA-01" ) assert ca_01["name"] == "congressional_district/CA-01" # Double digit districts should not have leading zero ca_37 = next( - item for item in metadata if item["name"] == "congressional_district/CA-37" + item + for item in metadata + if item["name"] == "congressional_district/CA-37" ) assert ca_37["name"] == "congressional_district/CA-37" diff --git a/tests/unit/services/test_economy_service.py b/tests/unit/services/test_economy_service.py index 779ec3e6..e27dba39 100644 --- a/tests/unit/services/test_economy_service.py +++ b/tests/unit/services/test_economy_service.py @@ -703,12 +703,18 @@ def test__given_prefixed_state__returns_unchanged(self): def test__given_congressional_district__returns_unchanged(self): service = EconomyService() - result = service._setup_region("us", "congressional_district/CA-37") + result = service._setup_region( + "us", "congressional_district/CA-37" + ) assert result == "congressional_district/CA-37" - def test__given_lowercase_congressional_district__returns_unchanged(self): + def test__given_lowercase_congressional_district__returns_unchanged( + self, + ): service = EconomyService() - result = service._setup_region("us", "congressional_district/ca-37") + result = service._setup_region( + "us", 
"congressional_district/ca-37" + ) assert result == "congressional_district/ca-37" def test__given_invalid_state__raises_value_error(self): @@ -723,17 +729,23 @@ def test__given_invalid_prefixed_state__raises_value_error(self): service._setup_region("us", "state/mb") assert "Invalid US state: 'mb'" in str(exc_info.value) - def test__given_invalid_congressional_district__raises_value_error(self): + def test__given_invalid_congressional_district__raises_value_error( + self, + ): service = EconomyService() with pytest.raises(ValueError) as exc_info: service._setup_region("us", "congressional_district/cruft") - assert "Invalid congressional district: 'cruft'" in str(exc_info.value) + assert "Invalid congressional district: 'cruft'" in str( + exc_info.value + ) def test__given_invalid_prefix__raises_value_error(self): service = EconomyService() with pytest.raises(ValueError) as exc_info: service._setup_region("us", "invalid_prefix/tx") - assert "Invalid US region: 'invalid_prefix/tx'" in str(exc_info.value) + assert "Invalid US region: 'invalid_prefix/tx'" in str( + exc_info.value + ) def test__given_invalid_bare_value__raises_value_error(self): service = EconomyService() @@ -741,7 +753,6 @@ def test__given_invalid_bare_value__raises_value_error(self): service._setup_region("us", "invalid_value") assert "Invalid US region: 'invalid_value'" in str(exc_info.value) - class TestValidateUsRegion: """Tests for the _validate_us_region method.""" @@ -765,18 +776,26 @@ def test__given_valid_congressional_district__does_not_raise(self): service = EconomyService() service._validate_us_region("congressional_district/CA-37") - def test__given_valid_congressional_district_lowercase__does_not_raise(self): + def test__given_valid_congressional_district_lowercase__does_not_raise( + self, + ): service = EconomyService() service._validate_us_region("congressional_district/ca-37") - def test__given_invalid_congressional_district__raises_value_error(self): + def 
test__given_invalid_congressional_district__raises_value_error( + self, + ): service = EconomyService() with pytest.raises(ValueError) as exc_info: service._validate_us_region("congressional_district/CA-99") - assert "Invalid congressional district: 'CA-99'" in str(exc_info.value) + assert "Invalid congressional district: 'CA-99'" in str( + exc_info.value + ) def test__given_nonexistent_district__raises_value_error(self): service = EconomyService() with pytest.raises(ValueError) as exc_info: service._validate_us_region("congressional_district/cruft") - assert "Invalid congressional district: 'cruft'" in str(exc_info.value) + assert "Invalid congressional district: 'cruft'" in str( + exc_info.value + ) From 7c2a14da135273efab5bfff3433c1b51b3ff2b0a Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Tue, 9 Dec 2025 20:54:25 +0400 Subject: [PATCH 12/21] fix: Just in case, specify NYC dataset --- policyengine_api/services/economy_service.py | 18 +++++- tests/unit/services/test_economy_service.py | 58 ++++++++++++++++++++ 2 files changed, 75 insertions(+), 1 deletion(-) diff --git a/policyengine_api/services/economy_service.py b/policyengine_api/services/economy_service.py index af8f5f0f..2a9de6fe 100644 --- a/policyengine_api/services/economy_service.py +++ b/policyengine_api/services/economy_service.py @@ -457,7 +457,7 @@ def _setup_sim_options( "region": self._setup_region( country_id=country_id, region=region ), - "data": None, + "data": self._setup_data(region=region), "model_version": model_version, "data_version": data_version, } @@ -509,6 +509,22 @@ def _validate_us_region(self, region: str) -> None: f"Invalid congressional district: '{district_id}'" ) + def _setup_data(self, region: str) -> str | None: + """ + Determine the dataset to use based on the region. + + NYC simulations require a specific pooled CPS dataset. + This is specified in .py as its default, but we'll leave this + method here just in case. 
All other regions use their default + datasets by setting "dataset" to None. + """ + # NYC simulations must reference pooled CPS dataset + if region == "nyc": + return "gs://policyengine-us-data/pooled_3_year_cps_2023.h5" + + # All others receive no specific data arg (use default) + return None + # Note: The following methods that interface with the ReformImpactsService # are written separately because the service relies upon mutating an original # 'computing' record to 'ok' or 'error' status, rather than creating a new record. diff --git a/tests/unit/services/test_economy_service.py b/tests/unit/services/test_economy_service.py index e27dba39..60e76b36 100644 --- a/tests/unit/services/test_economy_service.py +++ b/tests/unit/services/test_economy_service.py @@ -753,6 +753,64 @@ def test__given_invalid_bare_value__raises_value_error(self): service._setup_region("us", "invalid_value") assert "Invalid US region: 'invalid_value'" in str(exc_info.value) + class TestSetupData: + def test__given_nyc_region__returns_pooled_cps(self): + # Test with NYC region - should return pooled CPS dataset + region = "nyc" + + # Create an instance of the class + service = EconomyService() + # Call the method + result = service._setup_data(region) + # Assert the expected value + assert ( + result == "gs://policyengine-us-data/pooled_3_year_cps_2023.h5" + ) + + def test__given_us_state__returns_none(self): + # Test with US state - should return None + region = "ca" + + # Create an instance of the class + service = EconomyService() + # Call the method + result = service._setup_data(region) + # Assert the expected value + assert result is None + + def test__given_us_nationwide__returns_none(self): + # Test with US nationwide region + region = "us" + + # Create an instance of the class + service = EconomyService() + # Call the method + result = service._setup_data(region) + # Assert the expected value + assert result is None + + def test__given_congressional_district__returns_none(self): + 
# Test with congressional district - should return None + region = "congressional_district/TX-14" + + # Create an instance of the class + service = EconomyService() + # Call the method + result = service._setup_data(region) + # Assert the expected value + assert result is None + + def test__given_state_prefix__returns_none(self): + # Test with prefixed state - should return None + region = "state/ut" + + # Create an instance of the class + service = EconomyService() + # Call the method + result = service._setup_data(region) + # Assert the expected value + assert result is None + class TestValidateUsRegion: """Tests for the _validate_us_region method.""" From 261180b1bc570aa5e81f00130ea1f90c19acba66 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Tue, 9 Dec 2025 20:55:30 +0400 Subject: [PATCH 13/21] chore: Lint --- policyengine_api/services/economy_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_api/services/economy_service.py b/policyengine_api/services/economy_service.py index 2a9de6fe..0fd80af0 100644 --- a/policyengine_api/services/economy_service.py +++ b/policyengine_api/services/economy_service.py @@ -515,7 +515,7 @@ def _setup_data(self, region: str) -> str | None: NYC simulations require a specific pooled CPS dataset. This is specified in .py as its default, but we'll leave this - method here just in case. All other regions use their default + method here just in case. All other regions use their default datasets by setting "dataset" to None. 
""" # NYC simulations must reference pooled CPS dataset From 578f2992a4830612f783d47c61848f8e638ddf4b Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Wed, 10 Dec 2025 12:53:02 +0400 Subject: [PATCH 14/21] fix: Properly handle at-large states --- .../data/congressional_districts.py | 8 ++++ .../unit/data/test_congressional_districts.py | 41 ++++++++++++++++++- 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/policyengine_api/data/congressional_districts.py b/policyengine_api/data/congressional_districts.py index af17fb31..46ad7946 100644 --- a/policyengine_api/data/congressional_districts.py +++ b/policyengine_api/data/congressional_districts.py @@ -93,6 +93,9 @@ class CongressionalDistrictMetadataItem(BaseModel): ) +# States with only one at-large congressional district +AT_LARGE_STATES: set[str] = {"AK", "DE", "DC", "ND", "SD", "VT", "WY"} + # All 435 US Congressional districts based on 2020 Census apportionment CONGRESSIONAL_DISTRICTS: list[CongressionalDistrictMetadataItem] = [ # Alabama - 7 districts @@ -634,14 +637,19 @@ def _build_district_name(state_code: str, number: int) -> str: def _build_district_label(state_code: str, number: int) -> str: """ Build the district label in the format: 's th congressional district + For at-large districts (states with only 1 district), use: 's at-large congressional district Examples: ("CA", 1) -> "California's 1st congressional district" ("NY", 2) -> "New York's 2nd congressional district" ("TX", 3) -> "Texas's 3rd congressional district" ("FL", 21) -> "Florida's 21st congressional district" + ("AK", 1) -> "Alaska's at-large congressional district" + ("WY", 1) -> "Wyoming's at-large congressional district" """ state_name = STATE_CODE_TO_NAME[state_code] + if state_code in AT_LARGE_STATES: + return f"{state_name}'s at-large congressional district" ordinal_suffix = _get_ordinal_suffix(number) return f"{state_name}'s {number}{ordinal_suffix} congressional district" diff --git 
a/tests/unit/data/test_congressional_districts.py b/tests/unit/data/test_congressional_districts.py index 2d8f6fce..0c2e1bf7 100644 --- a/tests/unit/data/test_congressional_districts.py +++ b/tests/unit/data/test_congressional_districts.py @@ -5,6 +5,7 @@ CongressionalDistrictMetadataItem, CONGRESSIONAL_DISTRICTS, STATE_CODE_TO_NAME, + AT_LARGE_STATES, build_congressional_district_metadata, get_valid_state_codes, get_valid_congressional_districts, @@ -88,8 +89,8 @@ def test__texas_has_38_districts(self): assert len(tx_districts) == 38 def test__at_large_states_have_1_district(self): - # States with only 1 at-large representative - at_large_states = ["AK", "DE", "ND", "SD", "VT", "WY"] + # States with only 1 at-large representative (excluding DC which is special) + at_large_states = [s for s in AT_LARGE_STATES if s != "DC"] for state_code in at_large_states: state_districts = [ d @@ -220,6 +221,42 @@ def test__district_numbers_have_leading_zeros(self): ) assert ca_37["name"] == "congressional_district/CA-37" + def test__at_large_states_have_at_large_label(self): + metadata = build_congressional_district_metadata() + # All at-large states should have "at-large" in label + for state_code in AT_LARGE_STATES: + district = next( + item + for item in metadata + if item["name"] + == f"congressional_district/{state_code}-01" + ) + assert ( + "at-large congressional district" in district["label"] + ), f"{state_code} should have at-large label" + + def test__alaska_at_large_label(self): + metadata = build_congressional_district_metadata() + ak_01 = next( + item + for item in metadata + if item["name"] == "congressional_district/AK-01" + ) + assert ( + ak_01["label"] == "Alaska's at-large congressional district" + ) + + def test__wyoming_at_large_label(self): + metadata = build_congressional_district_metadata() + wy_01 = next( + item + for item in metadata + if item["name"] == "congressional_district/WY-01" + ) + assert ( + wy_01["label"] == "Wyoming's at-large congressional 
district" + ) + class TestGetValidStateCodes: """Tests for the get_valid_state_codes function.""" From ad7c70a3dfd81655406b791f73b7843415379204 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Wed, 10 Dec 2025 13:15:48 +0400 Subject: [PATCH 15/21] feat: Add state info to district items --- .../data/congressional_districts.py | 13 ++++++-- .../unit/data/test_congressional_districts.py | 30 ++++++++++++++++++- 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/policyengine_api/data/congressional_districts.py b/policyengine_api/data/congressional_districts.py index 46ad7946..990f4f54 100644 --- a/policyengine_api/data/congressional_districts.py +++ b/policyengine_api/data/congressional_districts.py @@ -658,15 +658,20 @@ def build_congressional_district_metadata() -> list[dict]: """ Build the complete congressional district metadata structure for use in country.py. - Returns a list of dictionaries with 'name' and 'label' keys, formatted as: + Returns a list of dictionaries with 'name', 'label', 'state_abbreviation', + and 'state_name' keys, formatted as: [ { "name": "congressional_district/CA-01", - "label": "California's 1st congressional district" + "label": "California's 1st congressional district", + "state_abbreviation": "CA", + "state_name": "California" }, { "name": "congressional_district/CA-02", - "label": "California's 2nd congressional district" + "label": "California's 2nd congressional district", + "state_abbreviation": "CA", + "state_name": "California" }, ... 
] @@ -680,6 +685,8 @@ def build_congressional_district_metadata() -> list[dict]: "label": _build_district_label( district.state_code, district.number ), + "state_abbreviation": district.state_code, + "state_name": STATE_CODE_TO_NAME[district.state_code], } for district in CONGRESSIONAL_DISTRICTS ] diff --git a/tests/unit/data/test_congressional_districts.py b/tests/unit/data/test_congressional_districts.py index 0c2e1bf7..a511e74a 100644 --- a/tests/unit/data/test_congressional_districts.py +++ b/tests/unit/data/test_congressional_districts.py @@ -130,11 +130,13 @@ def test__returns_list_of_436_items(self): metadata = build_congressional_district_metadata() assert len(metadata) == 436 - def test__each_item_has_name_and_label_keys(self): + def test__each_item_has_required_keys(self): metadata = build_congressional_district_metadata() for item in metadata: assert "name" in item assert "label" in item + assert "state_abbreviation" in item + assert "state_name" in item def test__name_has_correct_format(self): metadata = build_congressional_district_metadata() @@ -155,6 +157,32 @@ def test__label_has_correct_format(self): ) assert ca_01["label"] == "California's 1st congressional district" + def test__state_abbreviation_is_uppercase(self): + metadata = build_congressional_district_metadata() + for item in metadata: + assert item["state_abbreviation"] == item["state_abbreviation"].upper() + assert len(item["state_abbreviation"]) == 2 + + def test__state_name_matches_abbreviation(self): + metadata = build_congressional_district_metadata() + ca_01 = next( + item + for item in metadata + if item["name"] == "congressional_district/CA-01" + ) + assert ca_01["state_abbreviation"] == "CA" + assert ca_01["state_name"] == "California" + + def test__dc_state_fields(self): + metadata = build_congressional_district_metadata() + dc_01 = next( + item + for item in metadata + if item["name"] == "congressional_district/DC-01" + ) + assert dc_01["state_abbreviation"] == "DC" + assert 
dc_01["state_name"] == "District of Columbia" + def test__ordinal_suffixes_are_correct(self): metadata = build_congressional_district_metadata() From 7d156f3785ca6fd6242a309aeefd5211a2887829 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Wed, 10 Dec 2025 15:20:09 +0400 Subject: [PATCH 16/21] fix: Prefix US regions, add type indicator --- policyengine_api/constants.py | 16 +++ policyengine_api/country.py | 117 +++++++++--------- .../data/congressional_districts.py | 5 +- policyengine_api/services/economy_service.py | 7 +- .../unit/data/test_congressional_districts.py | 6 + tests/unit/services/test_economy_service.py | 29 ++++- tests/unit/services/test_metadata_service.py | 37 +++++- 7 files changed, 154 insertions(+), 63 deletions(-) diff --git a/policyengine_api/constants.py b/policyengine_api/constants.py index 17b7e3fb..db87e948 100644 --- a/policyengine_api/constants.py +++ b/policyengine_api/constants.py @@ -25,11 +25,27 @@ except: COUNTRY_PACKAGE_VERSIONS = {country: "0.0.0" for country in COUNTRIES} +# Valid region types for each country +# These define the geographic scope categories for regions +US_REGION_TYPES = ( + "national", # National level (e.g., "us") + "state", # US states (e.g., "state/ca", "state/ny") + "city", # US cities (e.g., "city/nyc") + "congressional_district", # US congressional districts (e.g., "congressional_district/CA-37") +) + +UK_REGION_TYPES = ( + "national", # National level (e.g., "uk") + "country", # UK countries (e.g., "country/england", "country/scotland") + "constituency", # UK parliamentary constituencies (e.g., "constituency/Aldershot") +) + # Valid region prefixes for each country # These define the allowed geographic scope prefixes in region names REGION_PREFIXES = { "us": [ "state/", # US states (e.g., "state/ca", "state/ny") + "city/", # US cities (e.g., "city/nyc") "congressional_district/", # US congressional districts (e.g., "congressional_district/CA-37") ], "uk": [ diff --git a/policyengine_api/country.py 
b/policyengine_api/country.py index e5c9b284..4a9691d0 100644 --- a/policyengine_api/country.py +++ b/policyengine_api/country.py @@ -74,17 +74,18 @@ def build_microsimulation_options(self) -> dict: ) constituency_names = pd.read_csv(constituency_names_path) region = [ - dict(name="uk", label="the UK"), - dict(name="country/england", label="England"), - dict(name="country/scotland", label="Scotland"), - dict(name="country/wales", label="Wales"), - dict(name="country/ni", label="Northern Ireland"), + dict(name="uk", label="the UK", type="national"), + dict(name="country/england", label="England", type="country"), + dict(name="country/scotland", label="Scotland", type="country"), + dict(name="country/wales", label="Wales", type="country"), + dict(name="country/ni", label="Northern Ireland", type="country"), ] for i in range(len(constituency_names)): region.append( dict( name=f"constituency/{constituency_names.iloc[i]['name']}", label=constituency_names.iloc[i]["name"], + type="constituency", ) ) time_period = [ @@ -102,59 +103,59 @@ def build_microsimulation_options(self) -> dict: options["datasets"] = datasets elif self.country_id == "us": region = [ - dict(name="us", label="the US"), - dict(name="al", label="Alabama"), - dict(name="ak", label="Alaska"), - dict(name="az", label="Arizona"), - dict(name="ar", label="Arkansas"), - dict(name="ca", label="California"), - dict(name="co", label="Colorado"), - dict(name="ct", label="Connecticut"), - dict(name="de", label="Delaware"), - dict(name="dc", label="District of Columbia"), - dict(name="fl", label="Florida"), - dict(name="ga", label="Georgia"), - dict(name="hi", label="Hawaii"), - dict(name="id", label="Idaho"), - dict(name="il", label="Illinois"), - dict(name="in", label="Indiana"), - dict(name="ia", label="Iowa"), - dict(name="ks", label="Kansas"), - dict(name="ky", label="Kentucky"), - dict(name="la", label="Louisiana"), - dict(name="me", label="Maine"), - dict(name="md", label="Maryland"), - dict(name="ma", 
label="Massachusetts"), - dict(name="mi", label="Michigan"), - dict(name="mn", label="Minnesota"), - dict(name="ms", label="Mississippi"), - dict(name="mo", label="Missouri"), - dict(name="mt", label="Montana"), - dict(name="ne", label="Nebraska"), - dict(name="nv", label="Nevada"), - dict(name="nh", label="New Hampshire"), - dict(name="nj", label="New Jersey"), - dict(name="nm", label="New Mexico"), - dict(name="ny", label="New York"), - dict(name="nyc", label="New York City"), # Region, not State - dict(name="nc", label="North Carolina"), - dict(name="nd", label="North Dakota"), - dict(name="oh", label="Ohio"), - dict(name="ok", label="Oklahoma"), - dict(name="or", label="Oregon"), - dict(name="pa", label="Pennsylvania"), - dict(name="ri", label="Rhode Island"), - dict(name="sc", label="South Carolina"), - dict(name="sd", label="South Dakota"), - dict(name="tn", label="Tennessee"), - dict(name="tx", label="Texas"), - dict(name="ut", label="Utah"), - dict(name="vt", label="Vermont"), - dict(name="va", label="Virginia"), - dict(name="wa", label="Washington"), - dict(name="wv", label="West Virginia"), - dict(name="wi", label="Wisconsin"), - dict(name="wy", label="Wyoming"), + dict(name="us", label="the US", type="national"), + dict(name="state/al", label="Alabama", type="state"), + dict(name="state/ak", label="Alaska", type="state"), + dict(name="state/az", label="Arizona", type="state"), + dict(name="state/ar", label="Arkansas", type="state"), + dict(name="state/ca", label="California", type="state"), + dict(name="state/co", label="Colorado", type="state"), + dict(name="state/ct", label="Connecticut", type="state"), + dict(name="state/de", label="Delaware", type="state"), + dict(name="state/dc", label="District of Columbia", type="state"), + dict(name="state/fl", label="Florida", type="state"), + dict(name="state/ga", label="Georgia", type="state"), + dict(name="state/hi", label="Hawaii", type="state"), + dict(name="state/id", label="Idaho", type="state"), + 
dict(name="state/il", label="Illinois", type="state"), + dict(name="state/in", label="Indiana", type="state"), + dict(name="state/ia", label="Iowa", type="state"), + dict(name="state/ks", label="Kansas", type="state"), + dict(name="state/ky", label="Kentucky", type="state"), + dict(name="state/la", label="Louisiana", type="state"), + dict(name="state/me", label="Maine", type="state"), + dict(name="state/md", label="Maryland", type="state"), + dict(name="state/ma", label="Massachusetts", type="state"), + dict(name="state/mi", label="Michigan", type="state"), + dict(name="state/mn", label="Minnesota", type="state"), + dict(name="state/ms", label="Mississippi", type="state"), + dict(name="state/mo", label="Missouri", type="state"), + dict(name="state/mt", label="Montana", type="state"), + dict(name="state/ne", label="Nebraska", type="state"), + dict(name="state/nv", label="Nevada", type="state"), + dict(name="state/nh", label="New Hampshire", type="state"), + dict(name="state/nj", label="New Jersey", type="state"), + dict(name="state/nm", label="New Mexico", type="state"), + dict(name="state/ny", label="New York", type="state"), + dict(name="city/nyc", label="New York City", type="city"), + dict(name="state/nc", label="North Carolina", type="state"), + dict(name="state/nd", label="North Dakota", type="state"), + dict(name="state/oh", label="Ohio", type="state"), + dict(name="state/ok", label="Oklahoma", type="state"), + dict(name="state/or", label="Oregon", type="state"), + dict(name="state/pa", label="Pennsylvania", type="state"), + dict(name="state/ri", label="Rhode Island", type="state"), + dict(name="state/sc", label="South Carolina", type="state"), + dict(name="state/sd", label="South Dakota", type="state"), + dict(name="state/tn", label="Tennessee", type="state"), + dict(name="state/tx", label="Texas", type="state"), + dict(name="state/ut", label="Utah", type="state"), + dict(name="state/vt", label="Vermont", type="state"), + dict(name="state/va", 
label="Virginia", type="state"), + dict(name="state/wa", label="Washington", type="state"), + dict(name="state/wv", label="West Virginia", type="state"), + dict(name="state/wi", label="Wisconsin", type="state"), + dict(name="state/wy", label="Wyoming", type="state"), ] # Add all 436 congressional districts (435 voting + DC) region.extend(build_congressional_district_metadata()) diff --git a/policyengine_api/data/congressional_districts.py b/policyengine_api/data/congressional_districts.py index 990f4f54..6f26fa78 100644 --- a/policyengine_api/data/congressional_districts.py +++ b/policyengine_api/data/congressional_districts.py @@ -658,18 +658,20 @@ def build_congressional_district_metadata() -> list[dict]: """ Build the complete congressional district metadata structure for use in country.py. - Returns a list of dictionaries with 'name', 'label', 'state_abbreviation', + Returns a list of dictionaries with 'name', 'label', 'type', 'state_abbreviation', and 'state_name' keys, formatted as: [ { "name": "congressional_district/CA-01", "label": "California's 1st congressional district", + "type": "congressional_district", "state_abbreviation": "CA", "state_name": "California" }, { "name": "congressional_district/CA-02", "label": "California's 2nd congressional district", + "type": "congressional_district", "state_abbreviation": "CA", "state_name": "California" }, @@ -685,6 +687,7 @@ def build_congressional_district_metadata() -> list[dict]: "label": _build_district_label( district.state_code, district.number ), + "type": "congressional_district", "state_abbreviation": district.state_code, "state_name": STATE_CODE_TO_NAME[district.state_code], } diff --git a/policyengine_api/services/economy_service.py b/policyengine_api/services/economy_service.py index 0fd80af0..3f5bfd50 100644 --- a/policyengine_api/services/economy_service.py +++ b/policyengine_api/services/economy_service.py @@ -473,6 +473,10 @@ def _setup_region(self, country_id: str, region: str) -> str: # For US 
regions (excluding the national-level "us") if country_id == "us" and region != "us": + # Handle legacy "nyc" format (convert to "city/nyc") + if region == "nyc": + return "city/nyc" + # Check if region already has a valid prefix valid_prefixes = REGION_PREFIXES.get(country_id, []) has_valid_prefix = any( @@ -519,7 +523,8 @@ def _setup_data(self, region: str) -> str | None: datasets by setting "dataset" to None. """ # NYC simulations must reference pooled CPS dataset - if region == "nyc": + # Handle both legacy "nyc" and new "city/nyc" formats + if region in ("nyc", "city/nyc"): return "gs://policyengine-us-data/pooled_3_year_cps_2023.h5" # All others receive no specific data arg (use default) diff --git a/tests/unit/data/test_congressional_districts.py b/tests/unit/data/test_congressional_districts.py index a511e74a..4a273a09 100644 --- a/tests/unit/data/test_congressional_districts.py +++ b/tests/unit/data/test_congressional_districts.py @@ -135,6 +135,7 @@ def test__each_item_has_required_keys(self): for item in metadata: assert "name" in item assert "label" in item + assert "type" in item assert "state_abbreviation" in item assert "state_name" in item @@ -183,6 +184,11 @@ def test__dc_state_fields(self): assert dc_01["state_abbreviation"] == "DC" assert dc_01["state_name"] == "District of Columbia" + def test__type_is_congressional_district(self): + metadata = build_congressional_district_metadata() + for item in metadata: + assert item["type"] == "congressional_district" + def test__ordinal_suffixes_are_correct(self): metadata = build_congressional_district_metadata() diff --git a/tests/unit/services/test_economy_service.py b/tests/unit/services/test_economy_service.py index 60e76b36..a76c0056 100644 --- a/tests/unit/services/test_economy_service.py +++ b/tests/unit/services/test_economy_service.py @@ -753,9 +753,21 @@ def test__given_invalid_bare_value__raises_value_error(self): service._setup_region("us", "invalid_value") assert "Invalid US region: 
'invalid_value'" in str(exc_info.value) + def test__given_legacy_nyc__returns_city_nyc(self): + # Test legacy "nyc" format gets converted to "city/nyc" + service = EconomyService() + result = service._setup_region("us", "nyc") + assert result == "city/nyc" + + def test__given_city_nyc__returns_unchanged(self): + # Test new "city/nyc" format passes through unchanged + service = EconomyService() + result = service._setup_region("us", "city/nyc") + assert result == "city/nyc" + class TestSetupData: - def test__given_nyc_region__returns_pooled_cps(self): - # Test with NYC region - should return pooled CPS dataset + def test__given_legacy_nyc_region__returns_pooled_cps(self): + # Test with legacy NYC region format - should return pooled CPS dataset region = "nyc" # Create an instance of the class @@ -767,6 +779,19 @@ def test__given_nyc_region__returns_pooled_cps(self): result == "gs://policyengine-us-data/pooled_3_year_cps_2023.h5" ) + def test__given_city_nyc_region__returns_pooled_cps(self): + # Test with new city/nyc region format - should return pooled CPS dataset + region = "city/nyc" + + # Create an instance of the class + service = EconomyService() + # Call the method + result = service._setup_data(region) + # Assert the expected value + assert ( + result == "gs://policyengine-us-data/pooled_3_year_cps_2023.h5" + ) + def test__given_us_state__returns_none(self): # Test with US state - should return None region = "ca" diff --git a/tests/unit/services/test_metadata_service.py b/tests/unit/services/test_metadata_service.py index 80163b4c..43d9bde6 100644 --- a/tests/unit/services/test_metadata_service.py +++ b/tests/unit/services/test_metadata_service.py @@ -46,7 +46,18 @@ def test_get_metadata_empty_country_id(self): "country/ni", ], ), - ("us", 2, ["us", "ca", "ny", "tx", "fl"]), + ( + "us", + 2, + [ + "us", + "state/ca", + "state/ny", + "state/tx", + "state/fl", + "city/nyc", + ], + ), ("ca", 3, ["ca"]), ("ng", 4, ["ng"]), ("il", 5, ["il"]), @@ -108,3 +119,27 @@ 
def test_verify_metadata_for_given_country( # Verify datasets exist and are of correct type assert "datasets" in metadata["economy_options"] assert isinstance(metadata["economy_options"]["datasets"], list) + + @pytest.mark.parametrize( + "country_id, expected_types", + [ + ("uk", ["national", "country", "constituency"]), + ("us", ["national", "state", "city", "congressional_district"]), + ], + ) + def test_verify_region_types_for_given_country( + self, country_id, expected_types + ): + """ + Verifies that all regions for UK and US have a 'type' field + with valid values. + """ + service = MetadataService() + metadata = service.get_metadata(country_id) + + regions = metadata["economy_options"]["region"] + for region in regions: + assert "type" in region, f"Region '{region['name']}' missing 'type' field" + assert region["type"] in expected_types, ( + f"Region '{region['name']}' has invalid type '{region['type']}'" + ) From f6cc1cbd5ee6b82d825afaca406b0469da348dfd Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Wed, 10 Dec 2025 16:44:23 +0400 Subject: [PATCH 17/21] fix: Normalize US regions --- .../data/congressional_districts.py | 47 +++++++++++++++++++ policyengine_api/services/economy_service.py | 33 +++++-------- .../unit/data/test_congressional_districts.py | 46 ++++++++++++++++++ 3 files changed, 106 insertions(+), 20 deletions(-) diff --git a/policyengine_api/data/congressional_districts.py b/policyengine_api/data/congressional_districts.py index 6f26fa78..7aa54ab8 100644 --- a/policyengine_api/data/congressional_districts.py +++ b/policyengine_api/data/congressional_districts.py @@ -718,3 +718,50 @@ def get_valid_congressional_districts() -> set[str]: f"{district.state_code.lower()}-{_format_district_number(district.number)}" for district in CONGRESSIONAL_DISTRICTS } + + +def normalize_us_region(region: str) -> str: + """ + Normalize a US region string to the standard prefixed format. 
+ + This function handles legacy region formats (bare state codes like "ca") + and converts them to the standard format ("state/ca"). It should be called + as early as possible when processing region inputs to ensure consistent + handling throughout the system. + + Args: + region: A region string that may be in legacy or standard format. + Examples: "ca", "state/ca", "nyc", "city/nyc", + "congressional_district/CA-01", "us" + + Returns: + The normalized region string with appropriate prefix. + Examples: "state/ca", "city/nyc", "congressional_district/CA-01", "us" + + Note: + This function does NOT validate that the region is valid - it only + normalizes the format. Use _validate_us_region for validation. + """ + # Already has a valid prefix - return as-is + if ( + region.startswith("state/") + or region.startswith("city/") + or region.startswith("congressional_district/") + ): + return region + + # National level - no prefix needed + if region == "us": + return region + + # Legacy NYC format + if region == "nyc": + return "city/nyc" + + # Legacy bare state code (e.g., "ca", "tx", "NY") + # Check if it's a valid state code before adding prefix + if region.lower() in get_valid_state_codes(): + return f"state/{region}" + + # Unknown format - return as-is and let validation catch it + return region diff --git a/policyengine_api/services/economy_service.py b/policyengine_api/services/economy_service.py index 3f5bfd50..d7efcdc6 100644 --- a/policyengine_api/services/economy_service.py +++ b/policyengine_api/services/economy_service.py @@ -12,6 +12,7 @@ from policyengine_api.data.congressional_districts import ( get_valid_state_codes, get_valid_congressional_districts, + normalize_us_region, ) from policyengine.simulation import SimulationOptions from google.cloud.workflows import executions_v1 @@ -471,28 +472,13 @@ def _setup_region(self, country_id: str, region: str) -> str: Raises ValueError for invalid regions. 
""" - # For US regions (excluding the national-level "us") - if country_id == "us" and region != "us": - # Handle legacy "nyc" format (convert to "city/nyc") - if region == "nyc": - return "city/nyc" + # For US regions, normalize first (handles legacy formats like "ca" -> "state/ca") + if country_id == "us": + region = normalize_us_region(region) - # Check if region already has a valid prefix - valid_prefixes = REGION_PREFIXES.get(country_id, []) - has_valid_prefix = any( - region.startswith(prefix) for prefix in valid_prefixes - ) - - if has_valid_prefix: - # Validate the region value after the prefix + # Validate the normalized region (skip validation for national "us") + if region != "us": self._validate_us_region(region) - return region - else: - # Legacy format: bare region codes (e.g., "tx") need "state/" prefix - # Validate it's a real state code before adding prefix - if region.lower() not in get_valid_state_codes(): - raise ValueError(f"Invalid US region: '{region}'") - return "state/" + region return region @@ -506,12 +492,19 @@ def _validate_us_region(self, region: str) -> None: state_code = region[len("state/") :] if state_code.lower() not in get_valid_state_codes(): raise ValueError(f"Invalid US state: '{state_code}'") + elif region.startswith("city/"): + # Currently only NYC is supported + city_code = region[len("city/") :] + if city_code != "nyc": + raise ValueError(f"Invalid US city: '{city_code}'") elif region.startswith("congressional_district/"): district_id = region[len("congressional_district/") :] if district_id.lower() not in get_valid_congressional_districts(): raise ValueError( f"Invalid congressional district: '{district_id}'" ) + else: + raise ValueError(f"Invalid US region: '{region}'") def _setup_data(self, region: str) -> str | None: """ diff --git a/tests/unit/data/test_congressional_districts.py b/tests/unit/data/test_congressional_districts.py index 4a273a09..8774bc2e 100644 --- a/tests/unit/data/test_congressional_districts.py +++ 
b/tests/unit/data/test_congressional_districts.py @@ -9,6 +9,7 @@ build_congressional_district_metadata, get_valid_state_codes, get_valid_congressional_districts, + normalize_us_region, ) @@ -348,3 +349,48 @@ def test__does_not_contain_invalid_districts(self): assert "ca-99" not in districts assert "xx-01" not in districts assert "cruft" not in districts + + +class TestNormalizeUsRegion: + """Tests for the normalize_us_region function.""" + + def test__national_us_unchanged(self): + assert normalize_us_region("us") == "us" + + def test__prefixed_state_unchanged(self): + assert normalize_us_region("state/ca") == "state/ca" + assert normalize_us_region("state/TX") == "state/TX" + + def test__prefixed_city_unchanged(self): + assert normalize_us_region("city/nyc") == "city/nyc" + + def test__prefixed_congressional_district_unchanged(self): + assert ( + normalize_us_region("congressional_district/CA-37") + == "congressional_district/CA-37" + ) + assert ( + normalize_us_region("congressional_district/tx-14") + == "congressional_district/tx-14" + ) + + def test__legacy_nyc_converted(self): + assert normalize_us_region("nyc") == "city/nyc" + + def test__legacy_state_code_lowercase_converted(self): + assert normalize_us_region("ca") == "state/ca" + assert normalize_us_region("tx") == "state/tx" + assert normalize_us_region("ny") == "state/ny" + + def test__legacy_state_code_uppercase_converted(self): + assert normalize_us_region("CA") == "state/CA" + assert normalize_us_region("TX") == "state/TX" + + def test__legacy_dc_converted(self): + assert normalize_us_region("dc") == "state/dc" + assert normalize_us_region("DC") == "state/DC" + + def test__unknown_region_returned_unchanged(self): + # Unknown regions are returned as-is for validation to catch + assert normalize_us_region("invalid") == "invalid" + assert normalize_us_region("mb") == "mb" # Manitoba (Canadian) From 1e23ae704dd88f88c1f7c0ad22b102142d1d65a6 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Wed, 10 Dec 
2025 17:03:07 +0400 Subject: [PATCH 18/21] chore: Lint --- policyengine_api/country.py | 12 +++++++++--- tests/unit/data/test_congressional_districts.py | 16 +++++++--------- tests/unit/services/test_metadata_service.py | 10 ++++++---- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/policyengine_api/country.py b/policyengine_api/country.py index 4a9691d0..44cb4747 100644 --- a/policyengine_api/country.py +++ b/policyengine_api/country.py @@ -76,9 +76,13 @@ def build_microsimulation_options(self) -> dict: region = [ dict(name="uk", label="the UK", type="national"), dict(name="country/england", label="England", type="country"), - dict(name="country/scotland", label="Scotland", type="country"), + dict( + name="country/scotland", label="Scotland", type="country" + ), dict(name="country/wales", label="Wales", type="country"), - dict(name="country/ni", label="Northern Ireland", type="country"), + dict( + name="country/ni", label="Northern Ireland", type="country" + ), ] for i in range(len(constituency_names)): region.append( @@ -112,7 +116,9 @@ def build_microsimulation_options(self) -> dict: dict(name="state/co", label="Colorado", type="state"), dict(name="state/ct", label="Connecticut", type="state"), dict(name="state/de", label="Delaware", type="state"), - dict(name="state/dc", label="District of Columbia", type="state"), + dict( + name="state/dc", label="District of Columbia", type="state" + ), dict(name="state/fl", label="Florida", type="state"), dict(name="state/ga", label="Georgia", type="state"), dict(name="state/hi", label="Hawaii", type="state"), diff --git a/tests/unit/data/test_congressional_districts.py b/tests/unit/data/test_congressional_districts.py index 8774bc2e..05819916 100644 --- a/tests/unit/data/test_congressional_districts.py +++ b/tests/unit/data/test_congressional_districts.py @@ -162,7 +162,10 @@ def test__label_has_correct_format(self): def test__state_abbreviation_is_uppercase(self): metadata = 
build_congressional_district_metadata() for item in metadata: - assert item["state_abbreviation"] == item["state_abbreviation"].upper() + assert ( + item["state_abbreviation"] + == item["state_abbreviation"].upper() + ) assert len(item["state_abbreviation"]) == 2 def test__state_name_matches_abbreviation(self): @@ -263,8 +266,7 @@ def test__at_large_states_have_at_large_label(self): district = next( item for item in metadata - if item["name"] - == f"congressional_district/{state_code}-01" + if item["name"] == f"congressional_district/{state_code}-01" ) assert ( "at-large congressional district" in district["label"] @@ -277,9 +279,7 @@ def test__alaska_at_large_label(self): for item in metadata if item["name"] == "congressional_district/AK-01" ) - assert ( - ak_01["label"] == "Alaska's at-large congressional district" - ) + assert ak_01["label"] == "Alaska's at-large congressional district" def test__wyoming_at_large_label(self): metadata = build_congressional_district_metadata() @@ -288,9 +288,7 @@ def test__wyoming_at_large_label(self): for item in metadata if item["name"] == "congressional_district/WY-01" ) - assert ( - wy_01["label"] == "Wyoming's at-large congressional district" - ) + assert wy_01["label"] == "Wyoming's at-large congressional district" class TestGetValidStateCodes: diff --git a/tests/unit/services/test_metadata_service.py b/tests/unit/services/test_metadata_service.py index 43d9bde6..ac33d525 100644 --- a/tests/unit/services/test_metadata_service.py +++ b/tests/unit/services/test_metadata_service.py @@ -139,7 +139,9 @@ def test_verify_region_types_for_given_country( regions = metadata["economy_options"]["region"] for region in regions: - assert "type" in region, f"Region '{region['name']}' missing 'type' field" - assert region["type"] in expected_types, ( - f"Region '{region['name']}' has invalid type '{region['type']}'" - ) + assert ( + "type" in region + ), f"Region '{region['name']}' missing 'type' field" + assert ( + region["type"] in 
expected_types + ), f"Region '{region['name']}' has invalid type '{region['type']}'" From 1a05dfb7b9f1de60965e2666cc991fb0d76d0da2 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Thu, 11 Dec 2025 13:57:52 +0400 Subject: [PATCH 19/21] feat: Explicitly pass default dataset into API setup --- policyengine_api/services/economy_service.py | 51 +++++++++++--------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/policyengine_api/services/economy_service.py b/policyengine_api/services/economy_service.py index d7efcdc6..9f3aa042 100644 --- a/policyengine_api/services/economy_service.py +++ b/policyengine_api/services/economy_service.py @@ -15,6 +15,7 @@ normalize_us_region, ) from policyengine.simulation import SimulationOptions +from policyengine.utils.data.datasets import get_default_dataset from google.cloud.workflows import executions_v1 import json import datetime @@ -146,6 +147,11 @@ def get_economic_impact( try: + # Normalize region early for US; this allows us to accommodate legacy + # regions that don't contain a region prefix. + if country_id == "us": + region = normalize_us_region(region) + # Set up logging process_id: str = self._create_process_id() @@ -458,7 +464,9 @@ def _setup_sim_options( "region": self._setup_region( country_id=country_id, region=region ), - "data": self._setup_data(region=region), + "data": self._setup_data( + country_id=country_id, region=region + ), "model_version": model_version, "data_version": data_version, } @@ -466,19 +474,15 @@ def _setup_sim_options( def _setup_region(self, country_id: str, region: str) -> str: """ - Convert API v1 'region' option to API v2-compatible 'region' option. + Validate the region for the given country. - Validates that the region is a known valid region for the country. + Assumes region has already been normalized (e.g., "ca" -> "state/ca"). Raises ValueError for invalid regions. 
""" - # For US regions, normalize first (handles legacy formats like "ca" -> "state/ca") - if country_id == "us": - region = normalize_us_region(region) - - # Validate the normalized region (skip validation for national "us") - if region != "us": - self._validate_us_region(region) + # For US regions, validate (skip validation for national "us") + if country_id == "us" and region != "us": + self._validate_us_region(region) return region @@ -506,22 +510,23 @@ def _validate_us_region(self, region: str) -> None: else: raise ValueError(f"Invalid US region: '{region}'") - def _setup_data(self, region: str) -> str | None: + def _setup_data(self, country_id: str, region: str) -> str: """ - Determine the dataset to use based on the region. + Determine the dataset to use based on the country and region. - NYC simulations require a specific pooled CPS dataset. - This is specified in .py as its default, but we'll leave this - method here just in case. All other regions use their default - datasets by setting "dataset" to None. + Uses policyengine's get_default_dataset to resolve the appropriate + GCS path, making the dataset visible in GCP Console workflow inputs. 
""" - # NYC simulations must reference pooled CPS dataset - # Handle both legacy "nyc" and new "city/nyc" formats - if region in ("nyc", "city/nyc"): - return "gs://policyengine-us-data/pooled_3_year_cps_2023.h5" - - # All others receive no specific data arg (use default) - return None + try: + return get_default_dataset(country_id, region) + except ValueError as e: + logger.log_struct( + { + "message": f"Error getting default dataset for country={country_id}, region={region}: {str(e)}", + }, + severity="ERROR", + ) + raise # Note: The following methods that interface with the ReformImpactsService # are written separately because the service relies upon mutating an original From 0319a10524d460882500fb6892247c2a8d2fba13 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Thu, 11 Dec 2025 14:16:00 +0400 Subject: [PATCH 20/21] test: Update tests to use new dataset structure --- tests/fixtures/services/economy_service.py | 46 ++++ tests/unit/services/test_economy_service.py | 237 +++++++++++--------- 2 files changed, 175 insertions(+), 108 deletions(-) diff --git a/tests/fixtures/services/economy_service.py b/tests/fixtures/services/economy_service.py index de043853..453d8d33 100644 --- a/tests/fixtures/services/economy_service.py +++ b/tests/fixtures/services/economy_service.py @@ -182,3 +182,49 @@ def mock_execution_states(): "ACTIVE": executions_v1.Execution.State.ACTIVE, "CANCELLED": executions_v1.Execution.State.CANCELLED, } + + +# Expected GCS paths from get_default_dataset +MOCK_US_NATIONWIDE_DATASET = "gs://policyengine-us-data/cps_2023.h5" +MOCK_US_STATE_CA_DATASET = "gs://policyengine-us-data/states/CA.h5" +MOCK_US_STATE_UT_DATASET = "gs://policyengine-us-data/states/UT.h5" +MOCK_US_CITY_NYC_DATASET = "gs://policyengine-us-data/pooled_3_year_cps_2023.h5" +MOCK_US_DISTRICT_CA37_DATASET = "gs://policyengine-us-data/districts/CA-37.h5" +MOCK_UK_DATASET = "gs://policyengine-uk-data-private/enhanced_frs_2023_24.h5" + + +def mock_get_default_dataset_fn(country: str, 
region: str | None) -> str: + """Mock implementation of get_default_dataset for testing.""" + if country == "uk": + return MOCK_UK_DATASET + elif country == "us": + if region == "us" or region is None: + return MOCK_US_NATIONWIDE_DATASET + elif region == "state/ca": + return MOCK_US_STATE_CA_DATASET + elif region == "state/ut": + return MOCK_US_STATE_UT_DATASET + elif region == "city/nyc": + return MOCK_US_CITY_NYC_DATASET + elif region == "congressional_district/CA-37": + return MOCK_US_DISTRICT_CA37_DATASET + elif region.startswith("state/"): + # Generic state handling + state_code = region.split("/")[1].upper() + return f"gs://policyengine-us-data/states/{state_code}.h5" + elif region.startswith("congressional_district/"): + district_id = region.split("/")[1].upper() + return f"gs://policyengine-us-data/districts/{district_id}.h5" + else: + return MOCK_US_NATIONWIDE_DATASET + raise ValueError(f"Unknown country: {country}") + + +@pytest.fixture +def mock_get_default_dataset(): + """Mock get_default_dataset function.""" + with patch( + "policyengine_api.services.economy_service.get_default_dataset", + side_effect=mock_get_default_dataset_fn, + ) as mock: + yield mock diff --git a/tests/unit/services/test_economy_service.py b/tests/unit/services/test_economy_service.py index a76c0056..b1c2dc1e 100644 --- a/tests/unit/services/test_economy_service.py +++ b/tests/unit/services/test_economy_service.py @@ -522,6 +522,12 @@ def test__given_valid_data__creates_instance(self): assert options.options_hash == MOCK_OPTIONS_HASH class TestSetupSimOptions: + """Tests for _setup_sim_options method. + + Note: _setup_sim_options now expects pre-normalized regions and returns + GCS paths in the data field (not None). 
+ """ + test_country_id = "us" test_reform_policy = json.dumps( {"sample_param": {"2024-01-01.2100-12-31": 15}} @@ -531,12 +537,9 @@ class TestSetupSimOptions: test_time_period = 2025 test_scope: Literal["macro"] = "macro" - def test__given_valid_options__returns_correct_sim_options(self): - - # Create an instance of the class + def test__given_us_nationwide__returns_correct_sim_options(self): service = EconomyService() - # Call the method with the test data sim_options_model = service._setup_sim_options( self.test_country_id, self.test_reform_policy, @@ -548,7 +551,6 @@ def test__given_valid_options__returns_correct_sim_options(self): sim_options = sim_options_model.model_dump() - # Assert the expected values in the returned dictionary assert sim_options["country"] == self.test_country_id assert sim_options["scope"] == self.test_scope assert sim_options["reform"] == json.loads(self.test_reform_policy) @@ -557,22 +559,20 @@ def test__given_valid_options__returns_correct_sim_options(self): ) assert sim_options["time_period"] == self.test_time_period assert sim_options["region"] == "us" - assert sim_options["data"] is None + assert sim_options["data"] == "gs://policyengine-us-data/cps_2023.h5" - def test__given_us_state__returns_correct_sim_options(self): - # Test with a US state + def test__given_us_state_ca__returns_correct_sim_options(self): + # Test with a normalized US state (prefixed format) country_id = "us" reform_policy = json.dumps( {"sample_param": {"2024-01-01.2100-12-31": 15}} ) current_law_baseline_policy = json.dumps({}) - region = "ca" + region = "state/ca" # Pre-normalized time_period = 2025 scope = "macro" - # Create an instance of the class service = EconomyService() - # Call the method sim_options_model = service._setup_sim_options( country_id, reform_policy, @@ -581,8 +581,8 @@ def test__given_us_state__returns_correct_sim_options(self): time_period, scope, ) - # Assert the expected values in the returned dictionary sim_options = 
sim_options_model.model_dump() + assert sim_options["country"] == country_id assert sim_options["scope"] == scope assert sim_options["reform"] == json.loads(reform_policy) @@ -591,22 +591,20 @@ def test__given_us_state__returns_correct_sim_options(self): ) assert sim_options["time_period"] == time_period assert sim_options["region"] == "state/ca" - assert sim_options["data"] is None + assert sim_options["data"] == "gs://policyengine-us-data/states/CA.h5" def test__given_us_state_utah__returns_correct_sim_options(self): - # Test with Utah state + # Test with normalized Utah state country_id = "us" reform_policy = json.dumps( {"sample_param": {"2024-01-01.2100-12-31": 15}} ) current_law_baseline_policy = json.dumps({}) - region = "ut" + region = "state/ut" # Pre-normalized time_period = 2025 scope = "macro" - # Create an instance of the class service = EconomyService() - # Call the method sim_options_model = service._setup_sim_options( country_id, reform_policy, @@ -616,7 +614,7 @@ def test__given_us_state_utah__returns_correct_sim_options(self): scope, ) sim_options = sim_options_model.model_dump() - # Assert the expected values in the returned dictionary + assert sim_options["country"] == country_id assert sim_options["scope"] == scope assert sim_options["reform"] == json.loads(reform_policy) @@ -625,7 +623,7 @@ def test__given_us_state_utah__returns_correct_sim_options(self): ) assert sim_options["time_period"] == time_period assert sim_options["region"] == "state/ut" - assert sim_options["data"] is None + assert sim_options["data"] == "gs://policyengine-us-data/states/UT.h5" def test__given_cliff_target__returns_correct_sim_options(self): country_id = "us" @@ -637,10 +635,8 @@ def test__given_cliff_target__returns_correct_sim_options(self): time_period = 2025 scope = "macro" - # Create an instance of the class service = EconomyService() - # Call the method sim_options_model = service._setup_sim_options( country_id, reform_policy, @@ -651,7 +647,6 @@ def 
test__given_cliff_target__returns_correct_sim_options(self): include_cliffs=True, ) - # Assert the expected values in the returned dictionary sim_options = sim_options_model.model_dump() assert sim_options["country"] == country_id assert sim_options["scope"] == scope @@ -661,42 +656,93 @@ def test__given_cliff_target__returns_correct_sim_options(self): ) assert sim_options["time_period"] == time_period assert sim_options["region"] == region - assert sim_options["data"] is None + assert sim_options["data"] == "gs://policyengine-us-data/cps_2023.h5" assert sim_options["include_cliffs"] is True - class TestSetupRegion: - def test__given_us_state__returns_correct_region(self): - # Test with a US state + def test__given_uk__returns_correct_sim_options(self): + country_id = "uk" + reform_policy = json.dumps( + {"sample_param": {"2024-01-01.2100-12-31": 15}} + ) + current_law_baseline_policy = json.dumps({}) + region = "uk" + time_period = 2025 + scope = "macro" + + service = EconomyService() + + sim_options_model = service._setup_sim_options( + country_id, + reform_policy, + current_law_baseline_policy, + region, + time_period, + scope, + ) + + sim_options = sim_options_model.model_dump() + assert sim_options["country"] == country_id + assert sim_options["region"] == region + assert ( + sim_options["data"] + == "gs://policyengine-uk-data-private/enhanced_frs_2023_24.h5" + ) + + def test__given_congressional_district__returns_correct_sim_options( + self, + ): country_id = "us" - # US states always lowercase two-letter codes - region = "ca" + reform_policy = json.dumps( + {"sample_param": {"2024-01-01.2100-12-31": 15}} + ) + current_law_baseline_policy = json.dumps({}) + region = "congressional_district/CA-37" # Pre-normalized + time_period = 2025 + scope = "macro" - # Create an instance of the class service = EconomyService() - # Call the method - result = service._setup_region(country_id, region) - # Assert the expected value - assert result == "state/ca" + 
sim_options_model = service._setup_sim_options( + country_id, + reform_policy, + current_law_baseline_policy, + region, + time_period, + scope, + ) - def test__given_non_us_state__returns_correct_region(self): - # Test with non-US region - country_id = "uk" - region = "country/england" + sim_options = sim_options_model.model_dump() + assert sim_options["region"] == "congressional_district/CA-37" + assert ( + sim_options["data"] + == "gs://policyengine-us-data/districts/CA-37.h5" + ) - # Create an instance of the class + class TestSetupRegion: + """Tests for _setup_region method. + + Note: _setup_region now only validates regions - it assumes normalization + has already been done by normalize_us_region() earlier in the flow. + """ + + def test__given_prefixed_us_state__returns_unchanged(self): + # Test with a normalized US state (prefixed format) + service = EconomyService() + result = service._setup_region("us", "state/ca") + assert result == "state/ca" + + def test__given_non_us_region__returns_unchanged(self): + # Test with non-US region - no validation performed service = EconomyService() - # Call the method - result = service._setup_region(country_id, region) - # Assert the expected value - assert result == region + result = service._setup_region("uk", "country/england") + assert result == "country/england" def test__given_us_national__returns_us(self): service = EconomyService() result = service._setup_region("us", "us") assert result == "us" - def test__given_prefixed_state__returns_unchanged(self): + def test__given_prefixed_state_tx__returns_unchanged(self): service = EconomyService() result = service._setup_region("us", "state/tx") assert result == "state/tx" @@ -717,12 +763,6 @@ def test__given_lowercase_congressional_district__returns_unchanged( ) assert result == "congressional_district/ca-37" - def test__given_invalid_state__raises_value_error(self): - service = EconomyService() - with pytest.raises(ValueError) as exc_info: - 
service._setup_region("us", "mb") # Manitoba is Canadian - assert "Invalid US region: 'mb'" in str(exc_info.value) - def test__given_invalid_prefixed_state__raises_value_error(self): service = EconomyService() with pytest.raises(ValueError) as exc_info: @@ -748,93 +788,74 @@ def test__given_invalid_prefix__raises_value_error(self): ) def test__given_invalid_bare_value__raises_value_error(self): + # Bare values without prefix are now invalid (should be normalized first) service = EconomyService() with pytest.raises(ValueError) as exc_info: service._setup_region("us", "invalid_value") assert "Invalid US region: 'invalid_value'" in str(exc_info.value) - def test__given_legacy_nyc__returns_city_nyc(self): - # Test legacy "nyc" format gets converted to "city/nyc" - service = EconomyService() - result = service._setup_region("us", "nyc") - assert result == "city/nyc" - def test__given_city_nyc__returns_unchanged(self): - # Test new "city/nyc" format passes through unchanged + # Test normalized "city/nyc" format passes through service = EconomyService() result = service._setup_region("us", "city/nyc") assert result == "city/nyc" class TestSetupData: - def test__given_legacy_nyc_region__returns_pooled_cps(self): - # Test with legacy NYC region format - should return pooled CPS dataset - region = "nyc" + """Tests for _setup_data method. - # Create an instance of the class - service = EconomyService() - # Call the method - result = service._setup_data(region) - # Assert the expected value - assert ( - result == "gs://policyengine-us-data/pooled_3_year_cps_2023.h5" - ) + Note: _setup_data now uses get_default_dataset from policyengine package + to return GCS paths for all region types (not None). 
+ """ - def test__given_city_nyc_region__returns_pooled_cps(self): - # Test with new city/nyc region format - should return pooled CPS dataset - region = "city/nyc" - - # Create an instance of the class + def test__given_us_city_nyc__returns_pooled_cps(self): + # Test with normalized city/nyc format service = EconomyService() - # Call the method - result = service._setup_data(region) - # Assert the expected value + result = service._setup_data("us", "city/nyc") assert ( result == "gs://policyengine-us-data/pooled_3_year_cps_2023.h5" ) - def test__given_us_state__returns_none(self): - # Test with US state - should return None - region = "ca" + def test__given_us_state_ca__returns_state_dataset(self): + # Test with US state - returns state-specific dataset + service = EconomyService() + result = service._setup_data("us", "state/ca") + assert result == "gs://policyengine-us-data/states/CA.h5" - # Create an instance of the class + def test__given_us_state_ut__returns_state_dataset(self): + # Test with Utah state - returns state-specific dataset service = EconomyService() - # Call the method - result = service._setup_data(region) - # Assert the expected value - assert result is None + result = service._setup_data("us", "state/ut") + assert result == "gs://policyengine-us-data/states/UT.h5" - def test__given_us_nationwide__returns_none(self): + def test__given_us_nationwide__returns_cps_dataset(self): # Test with US nationwide region - region = "us" - - # Create an instance of the class service = EconomyService() - # Call the method - result = service._setup_data(region) - # Assert the expected value - assert result is None - - def test__given_congressional_district__returns_none(self): - # Test with congressional district - should return None - region = "congressional_district/TX-14" + result = service._setup_data("us", "us") + assert result == "gs://policyengine-us-data/cps_2023.h5" - # Create an instance of the class + def 
test__given_congressional_district__returns_district_dataset(self): + # Test with congressional district - returns district-specific dataset service = EconomyService() - # Call the method - result = service._setup_data(region) - # Assert the expected value - assert result is None + result = service._setup_data( + "us", "congressional_district/CA-37" + ) + assert result == "gs://policyengine-us-data/districts/CA-37.h5" - def test__given_state_prefix__returns_none(self): - # Test with prefixed state - should return None - region = "state/ut" + def test__given_uk__returns_efrs_dataset(self): + # Test with UK - returns enhanced FRS dataset + service = EconomyService() + result = service._setup_data("uk", "uk") + assert ( + result + == "gs://policyengine-uk-data-private/enhanced_frs_2023_24.h5" + ) - # Create an instance of the class + def test__given_invalid_country__raises_value_error(self, mock_logger): + # Test with invalid country service = EconomyService() - # Call the method - result = service._setup_data(region) - # Assert the expected value - assert result is None + with pytest.raises(ValueError) as exc_info: + service._setup_data("invalid", "region") + assert "invalid" in str(exc_info.value).lower() class TestValidateUsRegion: """Tests for the _validate_us_region method.""" From 1c10af202c78fe2d51cc8652ad69745cb805471b Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Thu, 11 Dec 2025 14:48:09 +0400 Subject: [PATCH 21/21] chore: Lint --- policyengine_api/services/economy_service.py | 4 +--- tests/fixtures/services/economy_service.py | 4 +++- tests/unit/services/test_economy_service.py | 20 +++++++++++++------- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/policyengine_api/services/economy_service.py b/policyengine_api/services/economy_service.py index 9f3aa042..68b86194 100644 --- a/policyengine_api/services/economy_service.py +++ b/policyengine_api/services/economy_service.py @@ -464,9 +464,7 @@ def _setup_sim_options( "region": 
self._setup_region( country_id=country_id, region=region ), - "data": self._setup_data( - country_id=country_id, region=region - ), + "data": self._setup_data(country_id=country_id, region=region), "model_version": model_version, "data_version": data_version, } diff --git a/tests/fixtures/services/economy_service.py b/tests/fixtures/services/economy_service.py index 453d8d33..83068bd0 100644 --- a/tests/fixtures/services/economy_service.py +++ b/tests/fixtures/services/economy_service.py @@ -188,7 +188,9 @@ def mock_execution_states(): MOCK_US_NATIONWIDE_DATASET = "gs://policyengine-us-data/cps_2023.h5" MOCK_US_STATE_CA_DATASET = "gs://policyengine-us-data/states/CA.h5" MOCK_US_STATE_UT_DATASET = "gs://policyengine-us-data/states/UT.h5" -MOCK_US_CITY_NYC_DATASET = "gs://policyengine-us-data/pooled_3_year_cps_2023.h5" +MOCK_US_CITY_NYC_DATASET = ( + "gs://policyengine-us-data/pooled_3_year_cps_2023.h5" +) MOCK_US_DISTRICT_CA37_DATASET = "gs://policyengine-us-data/districts/CA-37.h5" MOCK_UK_DATASET = "gs://policyengine-uk-data-private/enhanced_frs_2023_24.h5" diff --git a/tests/unit/services/test_economy_service.py b/tests/unit/services/test_economy_service.py index b1c2dc1e..d870a627 100644 --- a/tests/unit/services/test_economy_service.py +++ b/tests/unit/services/test_economy_service.py @@ -559,7 +559,9 @@ def test__given_us_nationwide__returns_correct_sim_options(self): ) assert sim_options["time_period"] == self.test_time_period assert sim_options["region"] == "us" - assert sim_options["data"] == "gs://policyengine-us-data/cps_2023.h5" + assert ( + sim_options["data"] == "gs://policyengine-us-data/cps_2023.h5" + ) def test__given_us_state_ca__returns_correct_sim_options(self): # Test with a normalized US state (prefixed format) @@ -591,7 +593,9 @@ def test__given_us_state_ca__returns_correct_sim_options(self): ) assert sim_options["time_period"] == time_period assert sim_options["region"] == "state/ca" - assert sim_options["data"] == 
"gs://policyengine-us-data/states/CA.h5" + assert ( + sim_options["data"] == "gs://policyengine-us-data/states/CA.h5" + ) def test__given_us_state_utah__returns_correct_sim_options(self): # Test with normalized Utah state @@ -623,7 +627,9 @@ def test__given_us_state_utah__returns_correct_sim_options(self): ) assert sim_options["time_period"] == time_period assert sim_options["region"] == "state/ut" - assert sim_options["data"] == "gs://policyengine-us-data/states/UT.h5" + assert ( + sim_options["data"] == "gs://policyengine-us-data/states/UT.h5" + ) def test__given_cliff_target__returns_correct_sim_options(self): country_id = "us" @@ -656,7 +662,9 @@ def test__given_cliff_target__returns_correct_sim_options(self): ) assert sim_options["time_period"] == time_period assert sim_options["region"] == region - assert sim_options["data"] == "gs://policyengine-us-data/cps_2023.h5" + assert ( + sim_options["data"] == "gs://policyengine-us-data/cps_2023.h5" + ) assert sim_options["include_cliffs"] is True def test__given_uk__returns_correct_sim_options(self): @@ -836,9 +844,7 @@ def test__given_us_nationwide__returns_cps_dataset(self): def test__given_congressional_district__returns_district_dataset(self): # Test with congressional district - returns district-specific dataset service = EconomyService() - result = service._setup_data( - "us", "congressional_district/CA-37" - ) + result = service._setup_data("us", "congressional_district/CA-37") assert result == "gs://policyengine-us-data/districts/CA-37.h5" def test__given_uk__returns_efrs_dataset(self):