diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..c83a0bbf 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,7 @@ +- bump: patch + changes: + added: + - US congressional district metadata + changed: + - US simulations use default datasets from .py + - Upgraded .py version to 0.7.0 \ No newline at end of file diff --git a/policyengine_api/constants.py b/policyengine_api/constants.py index 33f28466..db87e948 100644 --- a/policyengine_api/constants.py +++ b/policyengine_api/constants.py @@ -24,4 +24,34 @@ } except: COUNTRY_PACKAGE_VERSIONS = {country: "0.0.0" for country in COUNTRIES} + +# Valid region types for each country +# These define the geographic scope categories for regions +US_REGION_TYPES = ( + "national", # National level (e.g., "us") + "state", # US states (e.g., "state/ca", "state/ny") + "city", # US cities (e.g., "city/nyc") + "congressional_district", # US congressional districts (e.g., "congressional_district/CA-37") +) + +UK_REGION_TYPES = ( + "national", # National level (e.g., "uk") + "country", # UK countries (e.g., "country/england", "country/scotland") + "constituency", # UK parliamentary constituencies (e.g., "constituency/Aldershot") +) + +# Valid region prefixes for each country +# These define the allowed geographic scope prefixes in region names +REGION_PREFIXES = { + "us": [ + "state/", # US states (e.g., "state/ca", "state/ny") + "city/", # US cities (e.g., "city/nyc") + "congressional_district/", # US congressional districts (e.g., "congressional_district/CA-37") + ], + "uk": [ + "country/", # UK countries (e.g., "country/england", "country/scotland") + "constituency/", # UK parliamentary constituencies (e.g., "constituency/Aldershot") + ], +} + __version__ = VERSION diff --git a/policyengine_api/country.py b/policyengine_api/country.py index af2d8ec4..44cb4747 100644 --- a/policyengine_api/country.py +++ b/policyengine_api/country.py @@ -18,6 +18,9 @@ import math import pandas as pd from 
pathlib import Path +from policyengine_api.data.congressional_districts import ( + build_congressional_district_metadata, +) # Note: The following policyengine_[xx] imports are probably redundant. # These modules are imported dynamically in the __init__ function below. @@ -71,17 +74,22 @@ def build_microsimulation_options(self) -> dict: ) constituency_names = pd.read_csv(constituency_names_path) region = [ - dict(name="uk", label="the UK"), - dict(name="country/england", label="England"), - dict(name="country/scotland", label="Scotland"), - dict(name="country/wales", label="Wales"), - dict(name="country/ni", label="Northern Ireland"), + dict(name="uk", label="the UK", type="national"), + dict(name="country/england", label="England", type="country"), + dict( + name="country/scotland", label="Scotland", type="country" + ), + dict(name="country/wales", label="Wales", type="country"), + dict( + name="country/ni", label="Northern Ireland", type="country" + ), ] for i in range(len(constituency_names)): region.append( dict( name=f"constituency/{constituency_names.iloc[i]['name']}", label=constituency_names.iloc[i]["name"], + type="constituency", ) ) time_period = [ @@ -99,60 +107,64 @@ def build_microsimulation_options(self) -> dict: options["datasets"] = datasets elif self.country_id == "us": region = [ - dict(name="us", label="the US"), - dict(name="al", label="Alabama"), - dict(name="ak", label="Alaska"), - dict(name="az", label="Arizona"), - dict(name="ar", label="Arkansas"), - dict(name="ca", label="California"), - dict(name="co", label="Colorado"), - dict(name="ct", label="Connecticut"), - dict(name="de", label="Delaware"), - dict(name="dc", label="District of Columbia"), - dict(name="fl", label="Florida"), - dict(name="ga", label="Georgia"), - dict(name="hi", label="Hawaii"), - dict(name="id", label="Idaho"), - dict(name="il", label="Illinois"), - dict(name="in", label="Indiana"), - dict(name="ia", label="Iowa"), - dict(name="ks", label="Kansas"), - dict(name="ky", 
label="Kentucky"), - dict(name="la", label="Louisiana"), - dict(name="me", label="Maine"), - dict(name="md", label="Maryland"), - dict(name="ma", label="Massachusetts"), - dict(name="mi", label="Michigan"), - dict(name="mn", label="Minnesota"), - dict(name="ms", label="Mississippi"), - dict(name="mo", label="Missouri"), - dict(name="mt", label="Montana"), - dict(name="ne", label="Nebraska"), - dict(name="nv", label="Nevada"), - dict(name="nh", label="New Hampshire"), - dict(name="nj", label="New Jersey"), - dict(name="nm", label="New Mexico"), - dict(name="ny", label="New York"), - dict(name="nyc", label="New York City"), # Region, not State - dict(name="nc", label="North Carolina"), - dict(name="nd", label="North Dakota"), - dict(name="oh", label="Ohio"), - dict(name="ok", label="Oklahoma"), - dict(name="or", label="Oregon"), - dict(name="pa", label="Pennsylvania"), - dict(name="ri", label="Rhode Island"), - dict(name="sc", label="South Carolina"), - dict(name="sd", label="South Dakota"), - dict(name="tn", label="Tennessee"), - dict(name="tx", label="Texas"), - dict(name="ut", label="Utah"), - dict(name="vt", label="Vermont"), - dict(name="va", label="Virginia"), - dict(name="wa", label="Washington"), - dict(name="wv", label="West Virginia"), - dict(name="wi", label="Wisconsin"), - dict(name="wy", label="Wyoming"), + dict(name="us", label="the US", type="national"), + dict(name="state/al", label="Alabama", type="state"), + dict(name="state/ak", label="Alaska", type="state"), + dict(name="state/az", label="Arizona", type="state"), + dict(name="state/ar", label="Arkansas", type="state"), + dict(name="state/ca", label="California", type="state"), + dict(name="state/co", label="Colorado", type="state"), + dict(name="state/ct", label="Connecticut", type="state"), + dict(name="state/de", label="Delaware", type="state"), + dict( + name="state/dc", label="District of Columbia", type="state" + ), + dict(name="state/fl", label="Florida", type="state"), + 
dict(name="state/ga", label="Georgia", type="state"), + dict(name="state/hi", label="Hawaii", type="state"), + dict(name="state/id", label="Idaho", type="state"), + dict(name="state/il", label="Illinois", type="state"), + dict(name="state/in", label="Indiana", type="state"), + dict(name="state/ia", label="Iowa", type="state"), + dict(name="state/ks", label="Kansas", type="state"), + dict(name="state/ky", label="Kentucky", type="state"), + dict(name="state/la", label="Louisiana", type="state"), + dict(name="state/me", label="Maine", type="state"), + dict(name="state/md", label="Maryland", type="state"), + dict(name="state/ma", label="Massachusetts", type="state"), + dict(name="state/mi", label="Michigan", type="state"), + dict(name="state/mn", label="Minnesota", type="state"), + dict(name="state/ms", label="Mississippi", type="state"), + dict(name="state/mo", label="Missouri", type="state"), + dict(name="state/mt", label="Montana", type="state"), + dict(name="state/ne", label="Nebraska", type="state"), + dict(name="state/nv", label="Nevada", type="state"), + dict(name="state/nh", label="New Hampshire", type="state"), + dict(name="state/nj", label="New Jersey", type="state"), + dict(name="state/nm", label="New Mexico", type="state"), + dict(name="state/ny", label="New York", type="state"), + dict(name="city/nyc", label="New York City", type="city"), + dict(name="state/nc", label="North Carolina", type="state"), + dict(name="state/nd", label="North Dakota", type="state"), + dict(name="state/oh", label="Ohio", type="state"), + dict(name="state/ok", label="Oklahoma", type="state"), + dict(name="state/or", label="Oregon", type="state"), + dict(name="state/pa", label="Pennsylvania", type="state"), + dict(name="state/ri", label="Rhode Island", type="state"), + dict(name="state/sc", label="South Carolina", type="state"), + dict(name="state/sd", label="South Dakota", type="state"), + dict(name="state/tn", label="Tennessee", type="state"), + dict(name="state/tx", 
label="Texas", type="state"), + dict(name="state/ut", label="Utah", type="state"), + dict(name="state/vt", label="Vermont", type="state"), + dict(name="state/va", label="Virginia", type="state"), + dict(name="state/wa", label="Washington", type="state"), + dict(name="state/wv", label="West Virginia", type="state"), + dict(name="state/wi", label="Wisconsin", type="state"), + dict(name="state/wy", label="Wyoming", type="state"), ] + # Add all 436 congressional districts (435 voting + DC) + region.extend(build_congressional_district_metadata()) time_period = [ dict(name=2035, label="2035"), dict(name=2034, label="2034"), diff --git a/policyengine_api/data/congressional_districts.py b/policyengine_api/data/congressional_districts.py new file mode 100644 index 00000000..7aa54ab8 --- /dev/null +++ b/policyengine_api/data/congressional_districts.py @@ -0,0 +1,767 @@ +""" +US Congressional District Metadata + +This module defines the metadata for the 435 voting US Congressional districts +plus DC's non-voting delegate seat, based on the 2020 Census apportionment, +effective for the 118th Congress (2023-2025) through the 119th (2025-2027). 
+ +Source: https://ballotpedia.org/Congressional_apportionment_after_the_2020_census +Census Data: https://www.census.gov/data/tables/2020/dec/2020-apportionment-data.html +""" + +from pydantic import BaseModel, Field + + +# Mapping of two-letter uppercase state codes (including DC) to full names +STATE_CODE_TO_NAME = { + "AL": "Alabama", + "AK": "Alaska", + "AZ": "Arizona", + "AR": "Arkansas", + "CA": "California", + "CO": "Colorado", + "CT": "Connecticut", + "DE": "Delaware", + "DC": "District of Columbia", + "FL": "Florida", + "GA": "Georgia", + "HI": "Hawaii", + "ID": "Idaho", + "IL": "Illinois", + "IN": "Indiana", + "IA": "Iowa", + "KS": "Kansas", + "KY": "Kentucky", + "LA": "Louisiana", + "ME": "Maine", + "MD": "Maryland", + "MA": "Massachusetts", + "MI": "Michigan", + "MN": "Minnesota", + "MS": "Mississippi", + "MO": "Missouri", + "MT": "Montana", + "NE": "Nebraska", + "NV": "Nevada", + "NH": "New Hampshire", + "NJ": "New Jersey", + "NM": "New Mexico", + "NY": "New York", + "NC": "North Carolina", + "ND": "North Dakota", + "OH": "Ohio", + "OK": "Oklahoma", + "OR": "Oregon", + "PA": "Pennsylvania", + "RI": "Rhode Island", + "SC": "South Carolina", + "SD": "South Dakota", + "TN": "Tennessee", + "TX": "Texas", + "UT": "Utah", + "VT": "Vermont", + "VA": "Virginia", + "WA": "Washington", + "WV": "West Virginia", + "WI": "Wisconsin", + "WY": "Wyoming", +} + + +class CongressionalDistrictMetadataItem(BaseModel): + """ + Metadata for a single US Congressional district. 
+ + Uses Pydantic BaseModel for: + - Runtime validation of data integrity + - Automatic serialization/deserialization + - Consistency with existing codebase patterns (see policyengine_api/endpoints/economy/compare.py) + - Self-documenting schema with type hints + """ + + state_code: str = Field( + ..., + description="Two-letter US state code (uppercase)", + min_length=2, + max_length=2, + pattern="^[A-Z]{2}$", + ) + number: int = Field( + ..., + description="Congressional district number (1 for at-large districts)", + ge=1, + ) + + +# State codes (plus DC) that have a single at-large congressional district +AT_LARGE_STATES: set[str] = {"AK", "DE", "DC", "ND", "SD", "VT", "WY"} + +# The 435 districts from the 2020 Census apportionment, plus DC's delegate seat +CONGRESSIONAL_DISTRICTS: list[CongressionalDistrictMetadataItem] = [ + # Alabama - 7 districts + CongressionalDistrictMetadataItem(state_code="AL", number=1), + CongressionalDistrictMetadataItem(state_code="AL", number=2), + CongressionalDistrictMetadataItem(state_code="AL", number=3), + CongressionalDistrictMetadataItem(state_code="AL", number=4), + CongressionalDistrictMetadataItem(state_code="AL", number=5), + CongressionalDistrictMetadataItem(state_code="AL", number=6), + CongressionalDistrictMetadataItem(state_code="AL", number=7), + # Alaska - 1 at-large district + CongressionalDistrictMetadataItem(state_code="AK", number=1), + # Arizona - 9 districts + CongressionalDistrictMetadataItem(state_code="AZ", number=1), + CongressionalDistrictMetadataItem(state_code="AZ", number=2), + CongressionalDistrictMetadataItem(state_code="AZ", number=3), + CongressionalDistrictMetadataItem(state_code="AZ", number=4), + CongressionalDistrictMetadataItem(state_code="AZ", number=5), + CongressionalDistrictMetadataItem(state_code="AZ", number=6), + CongressionalDistrictMetadataItem(state_code="AZ", number=7), + CongressionalDistrictMetadataItem(state_code="AZ", number=8), + CongressionalDistrictMetadataItem(state_code="AZ", number=9), + # Arkansas - 
4 districts + CongressionalDistrictMetadataItem(state_code="AR", number=1), + CongressionalDistrictMetadataItem(state_code="AR", number=2), + CongressionalDistrictMetadataItem(state_code="AR", number=3), + CongressionalDistrictMetadataItem(state_code="AR", number=4), + # California - 52 districts + CongressionalDistrictMetadataItem(state_code="CA", number=1), + CongressionalDistrictMetadataItem(state_code="CA", number=2), + CongressionalDistrictMetadataItem(state_code="CA", number=3), + CongressionalDistrictMetadataItem(state_code="CA", number=4), + CongressionalDistrictMetadataItem(state_code="CA", number=5), + CongressionalDistrictMetadataItem(state_code="CA", number=6), + CongressionalDistrictMetadataItem(state_code="CA", number=7), + CongressionalDistrictMetadataItem(state_code="CA", number=8), + CongressionalDistrictMetadataItem(state_code="CA", number=9), + CongressionalDistrictMetadataItem(state_code="CA", number=10), + CongressionalDistrictMetadataItem(state_code="CA", number=11), + CongressionalDistrictMetadataItem(state_code="CA", number=12), + CongressionalDistrictMetadataItem(state_code="CA", number=13), + CongressionalDistrictMetadataItem(state_code="CA", number=14), + CongressionalDistrictMetadataItem(state_code="CA", number=15), + CongressionalDistrictMetadataItem(state_code="CA", number=16), + CongressionalDistrictMetadataItem(state_code="CA", number=17), + CongressionalDistrictMetadataItem(state_code="CA", number=18), + CongressionalDistrictMetadataItem(state_code="CA", number=19), + CongressionalDistrictMetadataItem(state_code="CA", number=20), + CongressionalDistrictMetadataItem(state_code="CA", number=21), + CongressionalDistrictMetadataItem(state_code="CA", number=22), + CongressionalDistrictMetadataItem(state_code="CA", number=23), + CongressionalDistrictMetadataItem(state_code="CA", number=24), + CongressionalDistrictMetadataItem(state_code="CA", number=25), + CongressionalDistrictMetadataItem(state_code="CA", number=26), + 
CongressionalDistrictMetadataItem(state_code="CA", number=27), + CongressionalDistrictMetadataItem(state_code="CA", number=28), + CongressionalDistrictMetadataItem(state_code="CA", number=29), + CongressionalDistrictMetadataItem(state_code="CA", number=30), + CongressionalDistrictMetadataItem(state_code="CA", number=31), + CongressionalDistrictMetadataItem(state_code="CA", number=32), + CongressionalDistrictMetadataItem(state_code="CA", number=33), + CongressionalDistrictMetadataItem(state_code="CA", number=34), + CongressionalDistrictMetadataItem(state_code="CA", number=35), + CongressionalDistrictMetadataItem(state_code="CA", number=36), + CongressionalDistrictMetadataItem(state_code="CA", number=37), + CongressionalDistrictMetadataItem(state_code="CA", number=38), + CongressionalDistrictMetadataItem(state_code="CA", number=39), + CongressionalDistrictMetadataItem(state_code="CA", number=40), + CongressionalDistrictMetadataItem(state_code="CA", number=41), + CongressionalDistrictMetadataItem(state_code="CA", number=42), + CongressionalDistrictMetadataItem(state_code="CA", number=43), + CongressionalDistrictMetadataItem(state_code="CA", number=44), + CongressionalDistrictMetadataItem(state_code="CA", number=45), + CongressionalDistrictMetadataItem(state_code="CA", number=46), + CongressionalDistrictMetadataItem(state_code="CA", number=47), + CongressionalDistrictMetadataItem(state_code="CA", number=48), + CongressionalDistrictMetadataItem(state_code="CA", number=49), + CongressionalDistrictMetadataItem(state_code="CA", number=50), + CongressionalDistrictMetadataItem(state_code="CA", number=51), + CongressionalDistrictMetadataItem(state_code="CA", number=52), + # Colorado - 8 districts + CongressionalDistrictMetadataItem(state_code="CO", number=1), + CongressionalDistrictMetadataItem(state_code="CO", number=2), + CongressionalDistrictMetadataItem(state_code="CO", number=3), + CongressionalDistrictMetadataItem(state_code="CO", number=4), + 
CongressionalDistrictMetadataItem(state_code="CO", number=5), + CongressionalDistrictMetadataItem(state_code="CO", number=6), + CongressionalDistrictMetadataItem(state_code="CO", number=7), + CongressionalDistrictMetadataItem(state_code="CO", number=8), + # Connecticut - 5 districts + CongressionalDistrictMetadataItem(state_code="CT", number=1), + CongressionalDistrictMetadataItem(state_code="CT", number=2), + CongressionalDistrictMetadataItem(state_code="CT", number=3), + CongressionalDistrictMetadataItem(state_code="CT", number=4), + CongressionalDistrictMetadataItem(state_code="CT", number=5), + # Delaware - 1 at-large district + CongressionalDistrictMetadataItem(state_code="DE", number=1), + # District of Columbia - 1 non-voting delegate + CongressionalDistrictMetadataItem(state_code="DC", number=1), + # Florida - 28 districts + CongressionalDistrictMetadataItem(state_code="FL", number=1), + CongressionalDistrictMetadataItem(state_code="FL", number=2), + CongressionalDistrictMetadataItem(state_code="FL", number=3), + CongressionalDistrictMetadataItem(state_code="FL", number=4), + CongressionalDistrictMetadataItem(state_code="FL", number=5), + CongressionalDistrictMetadataItem(state_code="FL", number=6), + CongressionalDistrictMetadataItem(state_code="FL", number=7), + CongressionalDistrictMetadataItem(state_code="FL", number=8), + CongressionalDistrictMetadataItem(state_code="FL", number=9), + CongressionalDistrictMetadataItem(state_code="FL", number=10), + CongressionalDistrictMetadataItem(state_code="FL", number=11), + CongressionalDistrictMetadataItem(state_code="FL", number=12), + CongressionalDistrictMetadataItem(state_code="FL", number=13), + CongressionalDistrictMetadataItem(state_code="FL", number=14), + CongressionalDistrictMetadataItem(state_code="FL", number=15), + CongressionalDistrictMetadataItem(state_code="FL", number=16), + CongressionalDistrictMetadataItem(state_code="FL", number=17), + CongressionalDistrictMetadataItem(state_code="FL", 
number=18), + CongressionalDistrictMetadataItem(state_code="FL", number=19), + CongressionalDistrictMetadataItem(state_code="FL", number=20), + CongressionalDistrictMetadataItem(state_code="FL", number=21), + CongressionalDistrictMetadataItem(state_code="FL", number=22), + CongressionalDistrictMetadataItem(state_code="FL", number=23), + CongressionalDistrictMetadataItem(state_code="FL", number=24), + CongressionalDistrictMetadataItem(state_code="FL", number=25), + CongressionalDistrictMetadataItem(state_code="FL", number=26), + CongressionalDistrictMetadataItem(state_code="FL", number=27), + CongressionalDistrictMetadataItem(state_code="FL", number=28), + # Georgia - 14 districts + CongressionalDistrictMetadataItem(state_code="GA", number=1), + CongressionalDistrictMetadataItem(state_code="GA", number=2), + CongressionalDistrictMetadataItem(state_code="GA", number=3), + CongressionalDistrictMetadataItem(state_code="GA", number=4), + CongressionalDistrictMetadataItem(state_code="GA", number=5), + CongressionalDistrictMetadataItem(state_code="GA", number=6), + CongressionalDistrictMetadataItem(state_code="GA", number=7), + CongressionalDistrictMetadataItem(state_code="GA", number=8), + CongressionalDistrictMetadataItem(state_code="GA", number=9), + CongressionalDistrictMetadataItem(state_code="GA", number=10), + CongressionalDistrictMetadataItem(state_code="GA", number=11), + CongressionalDistrictMetadataItem(state_code="GA", number=12), + CongressionalDistrictMetadataItem(state_code="GA", number=13), + CongressionalDistrictMetadataItem(state_code="GA", number=14), + # Hawaii - 2 districts + CongressionalDistrictMetadataItem(state_code="HI", number=1), + CongressionalDistrictMetadataItem(state_code="HI", number=2), + # Idaho - 2 districts + CongressionalDistrictMetadataItem(state_code="ID", number=1), + CongressionalDistrictMetadataItem(state_code="ID", number=2), + # Illinois - 17 districts + CongressionalDistrictMetadataItem(state_code="IL", number=1), + 
CongressionalDistrictMetadataItem(state_code="IL", number=2), + CongressionalDistrictMetadataItem(state_code="IL", number=3), + CongressionalDistrictMetadataItem(state_code="IL", number=4), + CongressionalDistrictMetadataItem(state_code="IL", number=5), + CongressionalDistrictMetadataItem(state_code="IL", number=6), + CongressionalDistrictMetadataItem(state_code="IL", number=7), + CongressionalDistrictMetadataItem(state_code="IL", number=8), + CongressionalDistrictMetadataItem(state_code="IL", number=9), + CongressionalDistrictMetadataItem(state_code="IL", number=10), + CongressionalDistrictMetadataItem(state_code="IL", number=11), + CongressionalDistrictMetadataItem(state_code="IL", number=12), + CongressionalDistrictMetadataItem(state_code="IL", number=13), + CongressionalDistrictMetadataItem(state_code="IL", number=14), + CongressionalDistrictMetadataItem(state_code="IL", number=15), + CongressionalDistrictMetadataItem(state_code="IL", number=16), + CongressionalDistrictMetadataItem(state_code="IL", number=17), + # Indiana - 9 districts + CongressionalDistrictMetadataItem(state_code="IN", number=1), + CongressionalDistrictMetadataItem(state_code="IN", number=2), + CongressionalDistrictMetadataItem(state_code="IN", number=3), + CongressionalDistrictMetadataItem(state_code="IN", number=4), + CongressionalDistrictMetadataItem(state_code="IN", number=5), + CongressionalDistrictMetadataItem(state_code="IN", number=6), + CongressionalDistrictMetadataItem(state_code="IN", number=7), + CongressionalDistrictMetadataItem(state_code="IN", number=8), + CongressionalDistrictMetadataItem(state_code="IN", number=9), + # Iowa - 4 districts + CongressionalDistrictMetadataItem(state_code="IA", number=1), + CongressionalDistrictMetadataItem(state_code="IA", number=2), + CongressionalDistrictMetadataItem(state_code="IA", number=3), + CongressionalDistrictMetadataItem(state_code="IA", number=4), + # Kansas - 4 districts + CongressionalDistrictMetadataItem(state_code="KS", number=1), 
+ CongressionalDistrictMetadataItem(state_code="KS", number=2), + CongressionalDistrictMetadataItem(state_code="KS", number=3), + CongressionalDistrictMetadataItem(state_code="KS", number=4), + # Kentucky - 6 districts + CongressionalDistrictMetadataItem(state_code="KY", number=1), + CongressionalDistrictMetadataItem(state_code="KY", number=2), + CongressionalDistrictMetadataItem(state_code="KY", number=3), + CongressionalDistrictMetadataItem(state_code="KY", number=4), + CongressionalDistrictMetadataItem(state_code="KY", number=5), + CongressionalDistrictMetadataItem(state_code="KY", number=6), + # Louisiana - 6 districts + CongressionalDistrictMetadataItem(state_code="LA", number=1), + CongressionalDistrictMetadataItem(state_code="LA", number=2), + CongressionalDistrictMetadataItem(state_code="LA", number=3), + CongressionalDistrictMetadataItem(state_code="LA", number=4), + CongressionalDistrictMetadataItem(state_code="LA", number=5), + CongressionalDistrictMetadataItem(state_code="LA", number=6), + # Maine - 2 districts + CongressionalDistrictMetadataItem(state_code="ME", number=1), + CongressionalDistrictMetadataItem(state_code="ME", number=2), + # Maryland - 8 districts + CongressionalDistrictMetadataItem(state_code="MD", number=1), + CongressionalDistrictMetadataItem(state_code="MD", number=2), + CongressionalDistrictMetadataItem(state_code="MD", number=3), + CongressionalDistrictMetadataItem(state_code="MD", number=4), + CongressionalDistrictMetadataItem(state_code="MD", number=5), + CongressionalDistrictMetadataItem(state_code="MD", number=6), + CongressionalDistrictMetadataItem(state_code="MD", number=7), + CongressionalDistrictMetadataItem(state_code="MD", number=8), + # Massachusetts - 9 districts + CongressionalDistrictMetadataItem(state_code="MA", number=1), + CongressionalDistrictMetadataItem(state_code="MA", number=2), + CongressionalDistrictMetadataItem(state_code="MA", number=3), + CongressionalDistrictMetadataItem(state_code="MA", number=4), + 
CongressionalDistrictMetadataItem(state_code="MA", number=5), + CongressionalDistrictMetadataItem(state_code="MA", number=6), + CongressionalDistrictMetadataItem(state_code="MA", number=7), + CongressionalDistrictMetadataItem(state_code="MA", number=8), + CongressionalDistrictMetadataItem(state_code="MA", number=9), + # Michigan - 13 districts + CongressionalDistrictMetadataItem(state_code="MI", number=1), + CongressionalDistrictMetadataItem(state_code="MI", number=2), + CongressionalDistrictMetadataItem(state_code="MI", number=3), + CongressionalDistrictMetadataItem(state_code="MI", number=4), + CongressionalDistrictMetadataItem(state_code="MI", number=5), + CongressionalDistrictMetadataItem(state_code="MI", number=6), + CongressionalDistrictMetadataItem(state_code="MI", number=7), + CongressionalDistrictMetadataItem(state_code="MI", number=8), + CongressionalDistrictMetadataItem(state_code="MI", number=9), + CongressionalDistrictMetadataItem(state_code="MI", number=10), + CongressionalDistrictMetadataItem(state_code="MI", number=11), + CongressionalDistrictMetadataItem(state_code="MI", number=12), + CongressionalDistrictMetadataItem(state_code="MI", number=13), + # Minnesota - 8 districts + CongressionalDistrictMetadataItem(state_code="MN", number=1), + CongressionalDistrictMetadataItem(state_code="MN", number=2), + CongressionalDistrictMetadataItem(state_code="MN", number=3), + CongressionalDistrictMetadataItem(state_code="MN", number=4), + CongressionalDistrictMetadataItem(state_code="MN", number=5), + CongressionalDistrictMetadataItem(state_code="MN", number=6), + CongressionalDistrictMetadataItem(state_code="MN", number=7), + CongressionalDistrictMetadataItem(state_code="MN", number=8), + # Mississippi - 4 districts + CongressionalDistrictMetadataItem(state_code="MS", number=1), + CongressionalDistrictMetadataItem(state_code="MS", number=2), + CongressionalDistrictMetadataItem(state_code="MS", number=3), + CongressionalDistrictMetadataItem(state_code="MS", 
number=4), + # Missouri - 8 districts + CongressionalDistrictMetadataItem(state_code="MO", number=1), + CongressionalDistrictMetadataItem(state_code="MO", number=2), + CongressionalDistrictMetadataItem(state_code="MO", number=3), + CongressionalDistrictMetadataItem(state_code="MO", number=4), + CongressionalDistrictMetadataItem(state_code="MO", number=5), + CongressionalDistrictMetadataItem(state_code="MO", number=6), + CongressionalDistrictMetadataItem(state_code="MO", number=7), + CongressionalDistrictMetadataItem(state_code="MO", number=8), + # Montana - 2 districts + CongressionalDistrictMetadataItem(state_code="MT", number=1), + CongressionalDistrictMetadataItem(state_code="MT", number=2), + # Nebraska - 3 districts + CongressionalDistrictMetadataItem(state_code="NE", number=1), + CongressionalDistrictMetadataItem(state_code="NE", number=2), + CongressionalDistrictMetadataItem(state_code="NE", number=3), + # Nevada - 4 districts + CongressionalDistrictMetadataItem(state_code="NV", number=1), + CongressionalDistrictMetadataItem(state_code="NV", number=2), + CongressionalDistrictMetadataItem(state_code="NV", number=3), + CongressionalDistrictMetadataItem(state_code="NV", number=4), + # New Hampshire - 2 districts + CongressionalDistrictMetadataItem(state_code="NH", number=1), + CongressionalDistrictMetadataItem(state_code="NH", number=2), + # New Jersey - 12 districts + CongressionalDistrictMetadataItem(state_code="NJ", number=1), + CongressionalDistrictMetadataItem(state_code="NJ", number=2), + CongressionalDistrictMetadataItem(state_code="NJ", number=3), + CongressionalDistrictMetadataItem(state_code="NJ", number=4), + CongressionalDistrictMetadataItem(state_code="NJ", number=5), + CongressionalDistrictMetadataItem(state_code="NJ", number=6), + CongressionalDistrictMetadataItem(state_code="NJ", number=7), + CongressionalDistrictMetadataItem(state_code="NJ", number=8), + CongressionalDistrictMetadataItem(state_code="NJ", number=9), + 
CongressionalDistrictMetadataItem(state_code="NJ", number=10), + CongressionalDistrictMetadataItem(state_code="NJ", number=11), + CongressionalDistrictMetadataItem(state_code="NJ", number=12), + # New Mexico - 3 districts + CongressionalDistrictMetadataItem(state_code="NM", number=1), + CongressionalDistrictMetadataItem(state_code="NM", number=2), + CongressionalDistrictMetadataItem(state_code="NM", number=3), + # New York - 26 districts + CongressionalDistrictMetadataItem(state_code="NY", number=1), + CongressionalDistrictMetadataItem(state_code="NY", number=2), + CongressionalDistrictMetadataItem(state_code="NY", number=3), + CongressionalDistrictMetadataItem(state_code="NY", number=4), + CongressionalDistrictMetadataItem(state_code="NY", number=5), + CongressionalDistrictMetadataItem(state_code="NY", number=6), + CongressionalDistrictMetadataItem(state_code="NY", number=7), + CongressionalDistrictMetadataItem(state_code="NY", number=8), + CongressionalDistrictMetadataItem(state_code="NY", number=9), + CongressionalDistrictMetadataItem(state_code="NY", number=10), + CongressionalDistrictMetadataItem(state_code="NY", number=11), + CongressionalDistrictMetadataItem(state_code="NY", number=12), + CongressionalDistrictMetadataItem(state_code="NY", number=13), + CongressionalDistrictMetadataItem(state_code="NY", number=14), + CongressionalDistrictMetadataItem(state_code="NY", number=15), + CongressionalDistrictMetadataItem(state_code="NY", number=16), + CongressionalDistrictMetadataItem(state_code="NY", number=17), + CongressionalDistrictMetadataItem(state_code="NY", number=18), + CongressionalDistrictMetadataItem(state_code="NY", number=19), + CongressionalDistrictMetadataItem(state_code="NY", number=20), + CongressionalDistrictMetadataItem(state_code="NY", number=21), + CongressionalDistrictMetadataItem(state_code="NY", number=22), + CongressionalDistrictMetadataItem(state_code="NY", number=23), + CongressionalDistrictMetadataItem(state_code="NY", number=24), + 
CongressionalDistrictMetadataItem(state_code="NY", number=25), + CongressionalDistrictMetadataItem(state_code="NY", number=26), + # North Carolina - 14 districts + CongressionalDistrictMetadataItem(state_code="NC", number=1), + CongressionalDistrictMetadataItem(state_code="NC", number=2), + CongressionalDistrictMetadataItem(state_code="NC", number=3), + CongressionalDistrictMetadataItem(state_code="NC", number=4), + CongressionalDistrictMetadataItem(state_code="NC", number=5), + CongressionalDistrictMetadataItem(state_code="NC", number=6), + CongressionalDistrictMetadataItem(state_code="NC", number=7), + CongressionalDistrictMetadataItem(state_code="NC", number=8), + CongressionalDistrictMetadataItem(state_code="NC", number=9), + CongressionalDistrictMetadataItem(state_code="NC", number=10), + CongressionalDistrictMetadataItem(state_code="NC", number=11), + CongressionalDistrictMetadataItem(state_code="NC", number=12), + CongressionalDistrictMetadataItem(state_code="NC", number=13), + CongressionalDistrictMetadataItem(state_code="NC", number=14), + # North Dakota - 1 at-large district + CongressionalDistrictMetadataItem(state_code="ND", number=1), + # Ohio - 15 districts + CongressionalDistrictMetadataItem(state_code="OH", number=1), + CongressionalDistrictMetadataItem(state_code="OH", number=2), + CongressionalDistrictMetadataItem(state_code="OH", number=3), + CongressionalDistrictMetadataItem(state_code="OH", number=4), + CongressionalDistrictMetadataItem(state_code="OH", number=5), + CongressionalDistrictMetadataItem(state_code="OH", number=6), + CongressionalDistrictMetadataItem(state_code="OH", number=7), + CongressionalDistrictMetadataItem(state_code="OH", number=8), + CongressionalDistrictMetadataItem(state_code="OH", number=9), + CongressionalDistrictMetadataItem(state_code="OH", number=10), + CongressionalDistrictMetadataItem(state_code="OH", number=11), + CongressionalDistrictMetadataItem(state_code="OH", number=12), + 
CongressionalDistrictMetadataItem(state_code="OH", number=13), + CongressionalDistrictMetadataItem(state_code="OH", number=14), + CongressionalDistrictMetadataItem(state_code="OH", number=15), + # Oklahoma - 5 districts + CongressionalDistrictMetadataItem(state_code="OK", number=1), + CongressionalDistrictMetadataItem(state_code="OK", number=2), + CongressionalDistrictMetadataItem(state_code="OK", number=3), + CongressionalDistrictMetadataItem(state_code="OK", number=4), + CongressionalDistrictMetadataItem(state_code="OK", number=5), + # Oregon - 6 districts + CongressionalDistrictMetadataItem(state_code="OR", number=1), + CongressionalDistrictMetadataItem(state_code="OR", number=2), + CongressionalDistrictMetadataItem(state_code="OR", number=3), + CongressionalDistrictMetadataItem(state_code="OR", number=4), + CongressionalDistrictMetadataItem(state_code="OR", number=5), + CongressionalDistrictMetadataItem(state_code="OR", number=6), + # Pennsylvania - 17 districts + CongressionalDistrictMetadataItem(state_code="PA", number=1), + CongressionalDistrictMetadataItem(state_code="PA", number=2), + CongressionalDistrictMetadataItem(state_code="PA", number=3), + CongressionalDistrictMetadataItem(state_code="PA", number=4), + CongressionalDistrictMetadataItem(state_code="PA", number=5), + CongressionalDistrictMetadataItem(state_code="PA", number=6), + CongressionalDistrictMetadataItem(state_code="PA", number=7), + CongressionalDistrictMetadataItem(state_code="PA", number=8), + CongressionalDistrictMetadataItem(state_code="PA", number=9), + CongressionalDistrictMetadataItem(state_code="PA", number=10), + CongressionalDistrictMetadataItem(state_code="PA", number=11), + CongressionalDistrictMetadataItem(state_code="PA", number=12), + CongressionalDistrictMetadataItem(state_code="PA", number=13), + CongressionalDistrictMetadataItem(state_code="PA", number=14), + CongressionalDistrictMetadataItem(state_code="PA", number=15), + CongressionalDistrictMetadataItem(state_code="PA", 
number=16), + CongressionalDistrictMetadataItem(state_code="PA", number=17), + # Rhode Island - 2 districts + CongressionalDistrictMetadataItem(state_code="RI", number=1), + CongressionalDistrictMetadataItem(state_code="RI", number=2), + # South Carolina - 7 districts + CongressionalDistrictMetadataItem(state_code="SC", number=1), + CongressionalDistrictMetadataItem(state_code="SC", number=2), + CongressionalDistrictMetadataItem(state_code="SC", number=3), + CongressionalDistrictMetadataItem(state_code="SC", number=4), + CongressionalDistrictMetadataItem(state_code="SC", number=5), + CongressionalDistrictMetadataItem(state_code="SC", number=6), + CongressionalDistrictMetadataItem(state_code="SC", number=7), + # South Dakota - 1 at-large district + CongressionalDistrictMetadataItem(state_code="SD", number=1), + # Tennessee - 9 districts + CongressionalDistrictMetadataItem(state_code="TN", number=1), + CongressionalDistrictMetadataItem(state_code="TN", number=2), + CongressionalDistrictMetadataItem(state_code="TN", number=3), + CongressionalDistrictMetadataItem(state_code="TN", number=4), + CongressionalDistrictMetadataItem(state_code="TN", number=5), + CongressionalDistrictMetadataItem(state_code="TN", number=6), + CongressionalDistrictMetadataItem(state_code="TN", number=7), + CongressionalDistrictMetadataItem(state_code="TN", number=8), + CongressionalDistrictMetadataItem(state_code="TN", number=9), + # Texas - 38 districts + CongressionalDistrictMetadataItem(state_code="TX", number=1), + CongressionalDistrictMetadataItem(state_code="TX", number=2), + CongressionalDistrictMetadataItem(state_code="TX", number=3), + CongressionalDistrictMetadataItem(state_code="TX", number=4), + CongressionalDistrictMetadataItem(state_code="TX", number=5), + CongressionalDistrictMetadataItem(state_code="TX", number=6), + CongressionalDistrictMetadataItem(state_code="TX", number=7), + CongressionalDistrictMetadataItem(state_code="TX", number=8), + 
CongressionalDistrictMetadataItem(state_code="TX", number=9), + CongressionalDistrictMetadataItem(state_code="TX", number=10), + CongressionalDistrictMetadataItem(state_code="TX", number=11), + CongressionalDistrictMetadataItem(state_code="TX", number=12), + CongressionalDistrictMetadataItem(state_code="TX", number=13), + CongressionalDistrictMetadataItem(state_code="TX", number=14), + CongressionalDistrictMetadataItem(state_code="TX", number=15), + CongressionalDistrictMetadataItem(state_code="TX", number=16), + CongressionalDistrictMetadataItem(state_code="TX", number=17), + CongressionalDistrictMetadataItem(state_code="TX", number=18), + CongressionalDistrictMetadataItem(state_code="TX", number=19), + CongressionalDistrictMetadataItem(state_code="TX", number=20), + CongressionalDistrictMetadataItem(state_code="TX", number=21), + CongressionalDistrictMetadataItem(state_code="TX", number=22), + CongressionalDistrictMetadataItem(state_code="TX", number=23), + CongressionalDistrictMetadataItem(state_code="TX", number=24), + CongressionalDistrictMetadataItem(state_code="TX", number=25), + CongressionalDistrictMetadataItem(state_code="TX", number=26), + CongressionalDistrictMetadataItem(state_code="TX", number=27), + CongressionalDistrictMetadataItem(state_code="TX", number=28), + CongressionalDistrictMetadataItem(state_code="TX", number=29), + CongressionalDistrictMetadataItem(state_code="TX", number=30), + CongressionalDistrictMetadataItem(state_code="TX", number=31), + CongressionalDistrictMetadataItem(state_code="TX", number=32), + CongressionalDistrictMetadataItem(state_code="TX", number=33), + CongressionalDistrictMetadataItem(state_code="TX", number=34), + CongressionalDistrictMetadataItem(state_code="TX", number=35), + CongressionalDistrictMetadataItem(state_code="TX", number=36), + CongressionalDistrictMetadataItem(state_code="TX", number=37), + CongressionalDistrictMetadataItem(state_code="TX", number=38), + # Utah - 4 districts + 
CongressionalDistrictMetadataItem(state_code="UT", number=1), + CongressionalDistrictMetadataItem(state_code="UT", number=2), + CongressionalDistrictMetadataItem(state_code="UT", number=3), + CongressionalDistrictMetadataItem(state_code="UT", number=4), + # Vermont - 1 at-large district + CongressionalDistrictMetadataItem(state_code="VT", number=1), + # Virginia - 11 districts + CongressionalDistrictMetadataItem(state_code="VA", number=1), + CongressionalDistrictMetadataItem(state_code="VA", number=2), + CongressionalDistrictMetadataItem(state_code="VA", number=3), + CongressionalDistrictMetadataItem(state_code="VA", number=4), + CongressionalDistrictMetadataItem(state_code="VA", number=5), + CongressionalDistrictMetadataItem(state_code="VA", number=6), + CongressionalDistrictMetadataItem(state_code="VA", number=7), + CongressionalDistrictMetadataItem(state_code="VA", number=8), + CongressionalDistrictMetadataItem(state_code="VA", number=9), + CongressionalDistrictMetadataItem(state_code="VA", number=10), + CongressionalDistrictMetadataItem(state_code="VA", number=11), + # Washington - 10 districts + CongressionalDistrictMetadataItem(state_code="WA", number=1), + CongressionalDistrictMetadataItem(state_code="WA", number=2), + CongressionalDistrictMetadataItem(state_code="WA", number=3), + CongressionalDistrictMetadataItem(state_code="WA", number=4), + CongressionalDistrictMetadataItem(state_code="WA", number=5), + CongressionalDistrictMetadataItem(state_code="WA", number=6), + CongressionalDistrictMetadataItem(state_code="WA", number=7), + CongressionalDistrictMetadataItem(state_code="WA", number=8), + CongressionalDistrictMetadataItem(state_code="WA", number=9), + CongressionalDistrictMetadataItem(state_code="WA", number=10), + # West Virginia - 2 districts + CongressionalDistrictMetadataItem(state_code="WV", number=1), + CongressionalDistrictMetadataItem(state_code="WV", number=2), + # Wisconsin - 8 districts + CongressionalDistrictMetadataItem(state_code="WI", 
number=1), + CongressionalDistrictMetadataItem(state_code="WI", number=2), + CongressionalDistrictMetadataItem(state_code="WI", number=3), + CongressionalDistrictMetadataItem(state_code="WI", number=4), + CongressionalDistrictMetadataItem(state_code="WI", number=5), + CongressionalDistrictMetadataItem(state_code="WI", number=6), + CongressionalDistrictMetadataItem(state_code="WI", number=7), + CongressionalDistrictMetadataItem(state_code="WI", number=8), + # Wyoming - 1 at-large district + CongressionalDistrictMetadataItem(state_code="WY", number=1), +] + + +def _get_ordinal_suffix(number: int) -> str: + """ + Get the ordinal suffix for a number (st, nd, rd, th). + + Examples: + 1 -> "st" + 2 -> "nd" + 3 -> "rd" + 4 -> "th" + 11 -> "th" + 21 -> "st" + 22 -> "nd" + """ + if 10 <= number % 100 <= 20: + # Special case for 11th, 12th, 13th, etc. + suffix = "th" + else: + suffix = {1: "st", 2: "nd", 3: "rd"}.get(number % 10, "th") + return suffix + + +def _format_district_number(number: int) -> str: + """ + Format district number with leading zero for single digits. 
+ + Examples: + 1 -> "01" + 9 -> "09" + 10 -> "10" + 38 -> "38" + """ + return f"{number:02d}" + + +def _build_district_name(state_code: str, number: int) -> str: + """ + Build the district name in the format: congressional_district/{STATE_CODE}-{NUMBER} + + Examples: + ("CA", 5) -> "congressional_district/CA-05" + ("TX", 38) -> "congressional_district/TX-38" + ("DC", 1) -> "congressional_district/DC-01" + """ + return f"congressional_district/{state_code}-{_format_district_number(number)}" + + +def _build_district_label(state_code: str, number: int) -> str: + """ + Build the district label in the format: {STATE_NAME}'s {NUMBER}{ORDINAL_SUFFIX} congressional district + For at-large districts (states with only 1 district), use: {STATE_NAME}'s at-large congressional district + + Examples: + ("CA", 1) -> "California's 1st congressional district" + ("NY", 2) -> "New York's 2nd congressional district" + ("TX", 3) -> "Texas's 3rd congressional district" + ("FL", 21) -> "Florida's 21st congressional district" + ("AK", 1) -> "Alaska's at-large congressional district" + ("WY", 1) -> "Wyoming's at-large congressional district" + """ + state_name = STATE_CODE_TO_NAME[state_code] + if state_code in AT_LARGE_STATES: + return f"{state_name}'s at-large congressional district" + ordinal_suffix = _get_ordinal_suffix(number) + return f"{state_name}'s {number}{ordinal_suffix} congressional district" + + +def build_congressional_district_metadata() -> list[dict]: + """ + Build the complete congressional district metadata structure for use in country.py. 
+ + Returns a list of dictionaries with 'name', 'label', 'type', 'state_abbreviation', + and 'state_name' keys, formatted as: + [ + { + "name": "congressional_district/CA-01", + "label": "California's 1st congressional district", + "type": "congressional_district", + "state_abbreviation": "CA", + "state_name": "California" + }, + { + "name": "congressional_district/CA-02", + "label": "California's 2nd congressional district", + "type": "congressional_district", + "state_abbreviation": "CA", + "state_name": "California" + }, + ... + ] + + Returns: + List of 436 dictionaries (435 voting districts + DC) + """ + return [ + { + "name": _build_district_name(district.state_code, district.number), + "label": _build_district_label( + district.state_code, district.number + ), + "type": "congressional_district", + "state_abbreviation": district.state_code, + "state_name": STATE_CODE_TO_NAME[district.state_code], + } + for district in CONGRESSIONAL_DISTRICTS + ] + + +def get_valid_state_codes() -> set[str]: + """ + Get the set of valid US state codes (lowercase for case-insensitive matching). + + Returns: + Set of 51 lowercase state codes (50 states + DC) + """ + return {code.lower() for code in STATE_CODE_TO_NAME.keys()} + + +def get_valid_congressional_districts() -> set[str]: + """ + Get the set of valid congressional district identifiers (lowercase for case-insensitive matching). + + Format: "{state_code}-{district_number}" (e.g., "ca-37", "tx-01") + + Returns: + Set of 436 lowercase district identifiers + """ + return { + f"{district.state_code.lower()}-{_format_district_number(district.number)}" + for district in CONGRESSIONAL_DISTRICTS + } + + +def normalize_us_region(region: str) -> str: + """ + Normalize a US region string to the standard prefixed format. + + This function handles legacy region formats (bare state codes like "ca") + and converts them to the standard format ("state/ca"). 
It should be called + as early as possible when processing region inputs to ensure consistent + handling throughout the system. + + Args: + region: A region string that may be in legacy or standard format. + Examples: "ca", "state/ca", "nyc", "city/nyc", + "congressional_district/CA-01", "us" + + Returns: + The normalized region string with appropriate prefix. + Examples: "state/ca", "city/nyc", "congressional_district/CA-01", "us" + + Note: + This function does NOT validate that the region is valid - it only + normalizes the format. Use _validate_us_region for validation. + """ + # Already has a valid prefix - return as-is + if ( + region.startswith("state/") + or region.startswith("city/") + or region.startswith("congressional_district/") + ): + return region + + # National level - no prefix needed + if region == "us": + return region + + # Legacy NYC format + if region == "nyc": + return "city/nyc" + + # Legacy bare state code (e.g., "ca", "tx", "NY") + # Check if it's a valid state code before adding prefix + if region.lower() in get_valid_state_codes(): + return f"state/{region}" + + # Unknown format - return as-is and let validation catch it + return region diff --git a/policyengine_api/data/model_setup.py b/policyengine_api/data/model_setup.py index b734008e..a2a6a3ee 100644 --- a/policyengine_api/data/model_setup.py +++ b/policyengine_api/data/model_setup.py @@ -20,9 +20,12 @@ def get_dataset_version(country_id: str) -> str | None: """ - Get the latest dataset version for the specified country. If the country exists, but - no version is found, return None. If PolicyEngine does not publish data for the country, - raise a ValueError. + Get the dataset version for the specified country. If PolicyEngine does not + publish data for the country, raise a ValueError. + + By returning None for all valid countries, we allow policyengine.py to use + whatever default dataset version it has available, without imposing version + validation constraints from the API layer. 
""" match country_id: case "uk": diff --git a/policyengine_api/services/economy_service.py b/policyengine_api/services/economy_service.py index ae4f24be..68b86194 100644 --- a/policyengine_api/services/economy_service.py +++ b/policyengine_api/services/economy_service.py @@ -2,11 +2,20 @@ from policyengine_api.services.reform_impacts_service import ( ReformImpactsService, ) -from policyengine_api.constants import COUNTRY_PACKAGE_VERSIONS +from policyengine_api.constants import ( + COUNTRY_PACKAGE_VERSIONS, + REGION_PREFIXES, +) from policyengine_api.gcp_logging import logger from policyengine_api.libs.simulation_api import SimulationAPI from policyengine_api.data.model_setup import get_dataset_version +from policyengine_api.data.congressional_districts import ( + get_valid_state_codes, + get_valid_congressional_districts, + normalize_us_region, +) from policyengine.simulation import SimulationOptions +from policyengine.utils.data.datasets import get_default_dataset from google.cloud.workflows import executions_v1 import json import datetime @@ -138,6 +147,11 @@ def get_economic_impact( try: + # Normalize region early for US; this allows us to accommodate legacy + # regions that don't contain a region prefix. 
+ if country_id == "us": + region = normalize_us_region(region) + # Set up logging process_id: str = self._create_process_id() @@ -394,7 +408,6 @@ def _handle_create_impact( region=setup_options.region, time_period=setup_options.time_period, scope="macro", - dataset=setup_options.dataset, include_cliffs=setup_options.target == "cliff", model_version=setup_options.model_version, data_version=setup_options.data_version, @@ -430,7 +443,6 @@ def _setup_sim_options( reform_policy: Annotated[str, "String-formatted JSON"], baseline_policy: Annotated[str, "String-formatted JSON"], region: str, - dataset: str | None, time_period: str, scope: Literal["macro", "household"] = "macro", include_cliffs: bool = False, @@ -452,9 +464,7 @@ def _setup_sim_options( "region": self._setup_region( country_id=country_id, region=region ), - "data": self._setup_data( - dataset=dataset, country_id=country_id, region=region - ), + "data": self._setup_data(country_id=country_id, region=region), "model_version": model_version, "data_version": data_version, } @@ -462,34 +472,59 @@ def _setup_sim_options( def _setup_region(self, country_id: str, region: str) -> str: """ - Convert API v1 'region' option to API v2-compatible 'region' option. + Validate the region for the given country. + + Assumes region has already been normalized (e.g., "ca" -> "state/ca"). + Raises ValueError for invalid regions. """ - # For US, states must be prefixed with 'state/' + # For US regions, validate (skip validation for national "us") if country_id == "us" and region != "us": - return "state/" + region + self._validate_us_region(region) return region - def _setup_data( - self, dataset: str | None, country_id: str, region: str - ) -> str | None: - """ - Take API v1 'data' string literals, which reference a dataset name, - and convert to relevant GCP filepath. In future, this should be - redone to use a more robust method of accessing datasets. 
+ def _validate_us_region(self, region: str) -> None: """ + Validate a prefixed US region string. - # Enhanced CPS runs must reference ECPS dataset in Google Cloud bucket - if dataset == "enhanced_cps": - return "gs://policyengine-us-data/enhanced_cps_2024.h5" + Raises ValueError if the region is not valid. + """ + if region.startswith("state/"): + state_code = region[len("state/") :] + if state_code.lower() not in get_valid_state_codes(): + raise ValueError(f"Invalid US state: '{state_code}'") + elif region.startswith("city/"): + # Currently only NYC is supported + city_code = region[len("city/") :] + if city_code != "nyc": + raise ValueError(f"Invalid US city: '{city_code}'") + elif region.startswith("congressional_district/"): + district_id = region[len("congressional_district/") :] + if district_id.lower() not in get_valid_congressional_districts(): + raise ValueError( + f"Invalid congressional district: '{district_id}'" + ) + else: + raise ValueError(f"Invalid US region: '{region}'") - # NYC simulations must reference pooled CPS dataset - if region == "nyc": - return "gs://policyengine-us-data/pooled_3_year_cps_2023.h5" + def _setup_data(self, country_id: str, region: str) -> str: + """ + Determine the dataset to use based on the country and region. - # All others (including US state-level simulations) receive no sim API 'data' arg - return None + Uses policyengine's get_default_dataset to resolve the appropriate + GCS path, making the dataset visible in GCP Console workflow inputs. 
+ """ + try: + return get_default_dataset(country_id, region) + except ValueError as e: + logger.log_struct( + { + "message": f"Error getting default dataset for country={country_id}, region={region}: {str(e)}", + }, + severity="ERROR", + ) + raise # Note: The following methods that interface with the ReformImpactsService # are written separately because the service relies upon mutating an original diff --git a/setup.py b/setup.py index 239c2c67..c1232ce4 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ "policyengine_uk==2.39.0", "policyengine_us==1.457.0", "policyengine_core>=3.16.6", - "policyengine>=0.3.0", + "policyengine>=0.7.0", "pydantic", "pymysql", "python-dotenv", diff --git a/tests/fixtures/services/economy_service.py b/tests/fixtures/services/economy_service.py index de043853..83068bd0 100644 --- a/tests/fixtures/services/economy_service.py +++ b/tests/fixtures/services/economy_service.py @@ -182,3 +182,51 @@ def mock_execution_states(): "ACTIVE": executions_v1.Execution.State.ACTIVE, "CANCELLED": executions_v1.Execution.State.CANCELLED, } + + +# Expected GCS paths from get_default_dataset +MOCK_US_NATIONWIDE_DATASET = "gs://policyengine-us-data/cps_2023.h5" +MOCK_US_STATE_CA_DATASET = "gs://policyengine-us-data/states/CA.h5" +MOCK_US_STATE_UT_DATASET = "gs://policyengine-us-data/states/UT.h5" +MOCK_US_CITY_NYC_DATASET = ( + "gs://policyengine-us-data/pooled_3_year_cps_2023.h5" +) +MOCK_US_DISTRICT_CA37_DATASET = "gs://policyengine-us-data/districts/CA-37.h5" +MOCK_UK_DATASET = "gs://policyengine-uk-data-private/enhanced_frs_2023_24.h5" + + +def mock_get_default_dataset_fn(country: str, region: str | None) -> str: + """Mock implementation of get_default_dataset for testing.""" + if country == "uk": + return MOCK_UK_DATASET + elif country == "us": + if region == "us" or region is None: + return MOCK_US_NATIONWIDE_DATASET + elif region == "state/ca": + return MOCK_US_STATE_CA_DATASET + elif region == "state/ut": + return MOCK_US_STATE_UT_DATASET + 
elif region == "city/nyc": + return MOCK_US_CITY_NYC_DATASET + elif region == "congressional_district/CA-37": + return MOCK_US_DISTRICT_CA37_DATASET + elif region.startswith("state/"): + # Generic state handling + state_code = region.split("/")[1].upper() + return f"gs://policyengine-us-data/states/{state_code}.h5" + elif region.startswith("congressional_district/"): + district_id = region.split("/")[1].upper() + return f"gs://policyengine-us-data/districts/{district_id}.h5" + else: + return MOCK_US_NATIONWIDE_DATASET + raise ValueError(f"Unknown country: {country}") + + +@pytest.fixture +def mock_get_default_dataset(): + """Mock get_default_dataset function.""" + with patch( + "policyengine_api.services.economy_service.get_default_dataset", + side_effect=mock_get_default_dataset_fn, + ) as mock: + yield mock diff --git a/tests/unit/data/__init__.py b/tests/unit/data/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/data/test_congressional_districts.py b/tests/unit/data/test_congressional_districts.py new file mode 100644 index 00000000..05819916 --- /dev/null +++ b/tests/unit/data/test_congressional_districts.py @@ -0,0 +1,394 @@ +import pytest +from pydantic import ValidationError + +from policyengine_api.data.congressional_districts import ( + CongressionalDistrictMetadataItem, + CONGRESSIONAL_DISTRICTS, + STATE_CODE_TO_NAME, + AT_LARGE_STATES, + build_congressional_district_metadata, + get_valid_state_codes, + get_valid_congressional_districts, + normalize_us_region, +) + + +class TestCongressionalDistrictMetadataItem: + """Tests for the CongressionalDistrictMetadataItem Pydantic model.""" + + def test__given_valid_state_code_and_number__creates_item(self): + item = CongressionalDistrictMetadataItem(state_code="CA", number=37) + assert item.state_code == "CA" + assert item.number == 37 + + def test__given_lowercase_state_code__raises_validation_error(self): + with pytest.raises(ValidationError): + 
CongressionalDistrictMetadataItem(state_code="ca", number=1) + + def test__given_single_letter_state_code__raises_validation_error(self): + with pytest.raises(ValidationError): + CongressionalDistrictMetadataItem(state_code="C", number=1) + + def test__given_three_letter_state_code__raises_validation_error(self): + with pytest.raises(ValidationError): + CongressionalDistrictMetadataItem(state_code="CAL", number=1) + + def test__given_zero_district_number__raises_validation_error(self): + with pytest.raises(ValidationError): + CongressionalDistrictMetadataItem(state_code="CA", number=0) + + def test__given_negative_district_number__raises_validation_error(self): + with pytest.raises(ValidationError): + CongressionalDistrictMetadataItem(state_code="CA", number=-1) + + +class TestStateCodeToName: + """Tests for the STATE_CODE_TO_NAME mapping.""" + + def test__contains_50_states_plus_dc(self): + assert len(STATE_CODE_TO_NAME) == 51 + + def test__contains_all_state_codes_uppercase(self): + for code in STATE_CODE_TO_NAME.keys(): + assert code == code.upper() + assert len(code) == 2 + + def test__contains_dc(self): + assert "DC" in STATE_CODE_TO_NAME + assert STATE_CODE_TO_NAME["DC"] == "District of Columbia" + + def test__contains_california(self): + assert "CA" in STATE_CODE_TO_NAME + assert STATE_CODE_TO_NAME["CA"] == "California" + + +class TestCongressionalDistricts: + """Tests for the CONGRESSIONAL_DISTRICTS list.""" + + def test__contains_436_districts(self): + # 435 voting districts + 1 DC non-voting + assert len(CONGRESSIONAL_DISTRICTS) == 436 + + def test__all_items_are_valid_metadata_items(self): + for district in CONGRESSIONAL_DISTRICTS: + assert isinstance(district, CongressionalDistrictMetadataItem) + + def test__all_state_codes_are_in_state_code_to_name(self): + for district in CONGRESSIONAL_DISTRICTS: + assert district.state_code in STATE_CODE_TO_NAME + + def test__california_has_52_districts(self): + ca_districts = [ + d for d in CONGRESSIONAL_DISTRICTS 
if d.state_code == "CA" + ] + assert len(ca_districts) == 52 + + def test__texas_has_38_districts(self): + tx_districts = [ + d for d in CONGRESSIONAL_DISTRICTS if d.state_code == "TX" + ] + assert len(tx_districts) == 38 + + def test__at_large_states_have_1_district(self): + # States with only 1 at-large representative (excluding DC which is special) + at_large_states = [s for s in AT_LARGE_STATES if s != "DC"] + for state_code in at_large_states: + state_districts = [ + d + for d in CONGRESSIONAL_DISTRICTS + if d.state_code == state_code + ] + assert len(state_districts) == 1 + assert state_districts[0].number == 1 + + def test__dc_has_1_district(self): + dc_districts = [ + d for d in CONGRESSIONAL_DISTRICTS if d.state_code == "DC" + ] + assert len(dc_districts) == 1 + assert dc_districts[0].number == 1 + + def test__dc_comes_after_delaware(self): + # Find indices + de_indices = [ + i + for i, d in enumerate(CONGRESSIONAL_DISTRICTS) + if d.state_code == "DE" + ] + dc_indices = [ + i + for i, d in enumerate(CONGRESSIONAL_DISTRICTS) + if d.state_code == "DC" + ] + # DC should come after all DE districts + assert min(dc_indices) > max(de_indices) + + +class TestBuildCongressionalDistrictMetadata: + """Tests for the build_congressional_district_metadata function.""" + + def test__returns_list_of_436_items(self): + metadata = build_congressional_district_metadata() + assert len(metadata) == 436 + + def test__each_item_has_required_keys(self): + metadata = build_congressional_district_metadata() + for item in metadata: + assert "name" in item + assert "label" in item + assert "type" in item + assert "state_abbreviation" in item + assert "state_name" in item + + def test__name_has_correct_format(self): + metadata = build_congressional_district_metadata() + # Check first California district + ca_01 = next( + item + for item in metadata + if item["name"] == "congressional_district/CA-01" + ) + assert ca_01 is not None + + def test__label_has_correct_format(self): + 
metadata = build_congressional_district_metadata() + ca_01 = next( + item + for item in metadata + if item["name"] == "congressional_district/CA-01" + ) + assert ca_01["label"] == "California's 1st congressional district" + + def test__state_abbreviation_is_uppercase(self): + metadata = build_congressional_district_metadata() + for item in metadata: + assert ( + item["state_abbreviation"] + == item["state_abbreviation"].upper() + ) + assert len(item["state_abbreviation"]) == 2 + + def test__state_name_matches_abbreviation(self): + metadata = build_congressional_district_metadata() + ca_01 = next( + item + for item in metadata + if item["name"] == "congressional_district/CA-01" + ) + assert ca_01["state_abbreviation"] == "CA" + assert ca_01["state_name"] == "California" + + def test__dc_state_fields(self): + metadata = build_congressional_district_metadata() + dc_01 = next( + item + for item in metadata + if item["name"] == "congressional_district/DC-01" + ) + assert dc_01["state_abbreviation"] == "DC" + assert dc_01["state_name"] == "District of Columbia" + + def test__type_is_congressional_district(self): + metadata = build_congressional_district_metadata() + for item in metadata: + assert item["type"] == "congressional_district" + + def test__ordinal_suffixes_are_correct(self): + metadata = build_congressional_district_metadata() + + # Find specific districts to test ordinal suffixes + ca_01 = next( + item + for item in metadata + if item["name"] == "congressional_district/CA-01" + ) + ca_02 = next( + item + for item in metadata + if item["name"] == "congressional_district/CA-02" + ) + ca_03 = next( + item + for item in metadata + if item["name"] == "congressional_district/CA-03" + ) + ca_11 = next( + item + for item in metadata + if item["name"] == "congressional_district/CA-11" + ) + ca_12 = next( + item + for item in metadata + if item["name"] == "congressional_district/CA-12" + ) + ca_21 = next( + item + for item in metadata + if item["name"] == 
"congressional_district/CA-21" + ) + ca_22 = next( + item + for item in metadata + if item["name"] == "congressional_district/CA-22" + ) + + assert "1st" in ca_01["label"] + assert "2nd" in ca_02["label"] + assert "3rd" in ca_03["label"] + assert "11th" in ca_11["label"] + assert "12th" in ca_12["label"] + assert "21st" in ca_21["label"] + assert "22nd" in ca_22["label"] + + def test__district_numbers_have_leading_zeros(self): + metadata = build_congressional_district_metadata() + # Single digit districts should have leading zero + ca_01 = next( + item + for item in metadata + if item["name"] == "congressional_district/CA-01" + ) + assert ca_01["name"] == "congressional_district/CA-01" + + # Double digit districts should not have leading zero + ca_37 = next( + item + for item in metadata + if item["name"] == "congressional_district/CA-37" + ) + assert ca_37["name"] == "congressional_district/CA-37" + + def test__at_large_states_have_at_large_label(self): + metadata = build_congressional_district_metadata() + # All at-large states should have "at-large" in label + for state_code in AT_LARGE_STATES: + district = next( + item + for item in metadata + if item["name"] == f"congressional_district/{state_code}-01" + ) + assert ( + "at-large congressional district" in district["label"] + ), f"{state_code} should have at-large label" + + def test__alaska_at_large_label(self): + metadata = build_congressional_district_metadata() + ak_01 = next( + item + for item in metadata + if item["name"] == "congressional_district/AK-01" + ) + assert ak_01["label"] == "Alaska's at-large congressional district" + + def test__wyoming_at_large_label(self): + metadata = build_congressional_district_metadata() + wy_01 = next( + item + for item in metadata + if item["name"] == "congressional_district/WY-01" + ) + assert wy_01["label"] == "Wyoming's at-large congressional district" + + +class TestGetValidStateCodes: + """Tests for the get_valid_state_codes function.""" + + def 
test__returns_set_of_51_codes(self): + codes = get_valid_state_codes() + assert len(codes) == 51 + + def test__all_codes_are_lowercase(self): + codes = get_valid_state_codes() + for code in codes: + assert code == code.lower() + + def test__contains_california(self): + codes = get_valid_state_codes() + assert "ca" in codes + + def test__contains_dc(self): + codes = get_valid_state_codes() + assert "dc" in codes + + def test__does_not_contain_invalid_codes(self): + codes = get_valid_state_codes() + assert "xx" not in codes + assert "mb" not in codes # Manitoba (Canadian province) + + +class TestGetValidCongressionalDistricts: + """Tests for the get_valid_congressional_districts function.""" + + def test__returns_set_of_436_districts(self): + districts = get_valid_congressional_districts() + assert len(districts) == 436 + + def test__all_districts_are_lowercase(self): + districts = get_valid_congressional_districts() + for district in districts: + assert district == district.lower() + + def test__contains_california_37(self): + districts = get_valid_congressional_districts() + assert "ca-37" in districts + + def test__contains_dc_01(self): + districts = get_valid_congressional_districts() + assert "dc-01" in districts + + def test__single_digit_districts_have_leading_zero(self): + districts = get_valid_congressional_districts() + assert "ca-01" in districts + assert "ca-1" not in districts + + def test__does_not_contain_invalid_districts(self): + districts = get_valid_congressional_districts() + assert "ca-99" not in districts + assert "xx-01" not in districts + assert "cruft" not in districts + + +class TestNormalizeUsRegion: + """Tests for the normalize_us_region function.""" + + def test__national_us_unchanged(self): + assert normalize_us_region("us") == "us" + + def test__prefixed_state_unchanged(self): + assert normalize_us_region("state/ca") == "state/ca" + assert normalize_us_region("state/TX") == "state/TX" + + def test__prefixed_city_unchanged(self): + 
assert normalize_us_region("city/nyc") == "city/nyc" + + def test__prefixed_congressional_district_unchanged(self): + assert ( + normalize_us_region("congressional_district/CA-37") + == "congressional_district/CA-37" + ) + assert ( + normalize_us_region("congressional_district/tx-14") + == "congressional_district/tx-14" + ) + + def test__legacy_nyc_converted(self): + assert normalize_us_region("nyc") == "city/nyc" + + def test__legacy_state_code_lowercase_converted(self): + assert normalize_us_region("ca") == "state/ca" + assert normalize_us_region("tx") == "state/tx" + assert normalize_us_region("ny") == "state/ny" + + def test__legacy_state_code_uppercase_converted(self): + assert normalize_us_region("CA") == "state/CA" + assert normalize_us_region("TX") == "state/TX" + + def test__legacy_dc_converted(self): + assert normalize_us_region("dc") == "state/dc" + assert normalize_us_region("DC") == "state/DC" + + def test__unknown_region_returned_unchanged(self): + # Unknown regions are returned as-is for validation to catch + assert normalize_us_region("invalid") == "invalid" + assert normalize_us_region("mb") == "mb" # Manitoba (Canadian) diff --git a/tests/unit/data/test_model_setup.py b/tests/unit/data/test_model_setup.py new file mode 100644 index 00000000..45c69a11 --- /dev/null +++ b/tests/unit/data/test_model_setup.py @@ -0,0 +1,31 @@ +import pytest + +from policyengine_api.data.model_setup import get_dataset_version + + +class TestGetDatasetVersion: + """Tests for the get_dataset_version function.""" + + def test__given_us__returns_none(self): + result = get_dataset_version("us") + assert result is None + + def test__given_uk__returns_none(self): + result = get_dataset_version("uk") + assert result is None + + def test__given_invalid_country__raises_value_error(self): + with pytest.raises(ValueError) as exc_info: + get_dataset_version("invalid") + assert "Unknown country ID: invalid" in str(exc_info.value) + + def 
test__given_empty_string__raises_value_error(self): + with pytest.raises(ValueError) as exc_info: + get_dataset_version("") + assert "Unknown country ID:" in str(exc_info.value) + + def test__given_canada__raises_value_error(self): + # Canada is a valid country in the API but doesn't have dataset versioning + with pytest.raises(ValueError) as exc_info: + get_dataset_version("ca") + assert "Unknown country ID: ca" in str(exc_info.value) diff --git a/tests/unit/services/test_economy_service.py b/tests/unit/services/test_economy_service.py index 4f63672a..d870a627 100644 --- a/tests/unit/services/test_economy_service.py +++ b/tests/unit/services/test_economy_service.py @@ -522,35 +522,35 @@ def test__given_valid_data__creates_instance(self): assert options.options_hash == MOCK_OPTIONS_HASH class TestSetupSimOptions: + """Tests for _setup_sim_options method. + + Note: _setup_sim_options now expects pre-normalized regions and returns + GCS paths in the data field (not None). + """ + test_country_id = "us" test_reform_policy = json.dumps( {"sample_param": {"2024-01-01.2100-12-31": 15}} ) test_current_law_baseline_policy = json.dumps({}) test_region = "us" - test_dataset = None test_time_period = 2025 test_scope: Literal["macro"] = "macro" - def test__given_valid_options__returns_correct_sim_options(self): - - # Create an instance of the class + def test__given_us_nationwide__returns_correct_sim_options(self): service = EconomyService() - # Call the method with the test data; patch setup_region and setup_data methods sim_options_model = service._setup_sim_options( self.test_country_id, self.test_reform_policy, self.test_current_law_baseline_policy, self.test_region, - self.test_dataset, self.test_time_period, self.test_scope, ) sim_options = sim_options_model.model_dump() - # Assert the expected values in the returned dictionary assert sim_options["country"] == self.test_country_id assert sim_options["scope"] == self.test_scope assert sim_options["reform"] == 
json.loads(self.test_reform_policy) @@ -559,34 +559,32 @@ def test__given_valid_options__returns_correct_sim_options(self): ) assert sim_options["time_period"] == self.test_time_period assert sim_options["region"] == "us" - assert sim_options["data"] == None + assert ( + sim_options["data"] == "gs://policyengine-us-data/cps_2023.h5" + ) - def test__given_us_state__returns_correct_sim_options(self): - # Test with a US state + def test__given_us_state_ca__returns_correct_sim_options(self): + # Test with a normalized US state (prefixed format) country_id = "us" reform_policy = json.dumps( {"sample_param": {"2024-01-01.2100-12-31": 15}} ) current_law_baseline_policy = json.dumps({}) - region = "ca" - dataset = None + region = "state/ca" # Pre-normalized time_period = 2025 scope = "macro" - # Create an instance of the class service = EconomyService() - # Call the method sim_options_model = service._setup_sim_options( country_id, reform_policy, current_law_baseline_policy, region, - dataset, time_period, scope, ) - # Assert the expected values in the returned dictionary sim_options = sim_options_model.model_dump() + assert sim_options["country"] == country_id assert sim_options["scope"] == scope assert sim_options["reform"] == json.loads(reform_policy) @@ -595,34 +593,32 @@ def test__given_us_state__returns_correct_sim_options(self): ) assert sim_options["time_period"] == time_period assert sim_options["region"] == "state/ca" - assert sim_options["data"] is None + assert ( + sim_options["data"] == "gs://policyengine-us-data/states/CA.h5" + ) - def test__given_enhanced_cps_state__returns_correct_sim_options(self): - # Test with enhanced_cps dataset + def test__given_us_state_utah__returns_correct_sim_options(self): + # Test with normalized Utah state country_id = "us" reform_policy = json.dumps( {"sample_param": {"2024-01-01.2100-12-31": 15}} ) current_law_baseline_policy = json.dumps({}) - region = "ut" - dataset = "enhanced_cps" + region = "state/ut" # Pre-normalized 
time_period = 2025 scope = "macro" - # Create an instance of the class service = EconomyService() - # Call the method sim_options_model = service._setup_sim_options( country_id, reform_policy, current_law_baseline_policy, region, - dataset, time_period, scope, ) sim_options = sim_options_model.model_dump() - # Assert the expected values in the returned dictionary + assert sim_options["country"] == country_id assert sim_options["scope"] == scope assert sim_options["reform"] == json.loads(reform_policy) @@ -632,8 +628,7 @@ def test__given_enhanced_cps_state__returns_correct_sim_options(self): assert sim_options["time_period"] == time_period assert sim_options["region"] == "state/ut" assert ( - sim_options["data"] - == "gs://policyengine-us-data/enhanced_cps_2024.h5" + sim_options["data"] == "gs://policyengine-us-data/states/UT.h5" ) def test__given_cliff_target__returns_correct_sim_options(self): @@ -643,27 +638,21 @@ def test__given_cliff_target__returns_correct_sim_options(self): ) current_law_baseline_policy = json.dumps({}) region = "us" - dataset = None time_period = 2025 scope = "macro" - target = "cliff" - # Create an instance of the class service = EconomyService() - # Call the method sim_options_model = service._setup_sim_options( country_id, reform_policy, current_law_baseline_policy, region, - dataset, time_period, scope, - include_cliffs=target == "cliff", + include_cliffs=True, ) - # Assert the expected values in the returned dictionary sim_options = sim_options_model.model_dump() assert sim_options["country"] == country_id assert sim_options["scope"] == scope @@ -673,100 +662,250 @@ def test__given_cliff_target__returns_correct_sim_options(self): ) assert sim_options["time_period"] == time_period assert sim_options["region"] == region - assert sim_options["data"] == None - assert sim_options["include_cliffs"] == True + assert ( + sim_options["data"] == "gs://policyengine-us-data/cps_2023.h5" + ) + assert sim_options["include_cliffs"] is True - class 
TestSetupRegion: - def test__given_us_state__returns_correct_region(self): - # Test with a US state + def test__given_uk__returns_correct_sim_options(self): + country_id = "uk" + reform_policy = json.dumps( + {"sample_param": {"2024-01-01.2100-12-31": 15}} + ) + current_law_baseline_policy = json.dumps({}) + region = "uk" + time_period = 2025 + scope = "macro" + + service = EconomyService() + + sim_options_model = service._setup_sim_options( + country_id, + reform_policy, + current_law_baseline_policy, + region, + time_period, + scope, + ) + + sim_options = sim_options_model.model_dump() + assert sim_options["country"] == country_id + assert sim_options["region"] == region + assert ( + sim_options["data"] + == "gs://policyengine-uk-data-private/enhanced_frs_2023_24.h5" + ) + + def test__given_congressional_district__returns_correct_sim_options( + self, + ): country_id = "us" - # US states always lowercase two-letter codes - region = "ca" + reform_policy = json.dumps( + {"sample_param": {"2024-01-01.2100-12-31": 15}} + ) + current_law_baseline_policy = json.dumps({}) + region = "congressional_district/CA-37" # Pre-normalized + time_period = 2025 + scope = "macro" - # Create an instance of the class service = EconomyService() - # Call the method - result = service._setup_region(country_id, region) - # Assert the expected value + sim_options_model = service._setup_sim_options( + country_id, + reform_policy, + current_law_baseline_policy, + region, + time_period, + scope, + ) + + sim_options = sim_options_model.model_dump() + assert sim_options["region"] == "congressional_district/CA-37" + assert ( + sim_options["data"] + == "gs://policyengine-us-data/districts/CA-37.h5" + ) + + class TestSetupRegion: + """Tests for _setup_region method. + + Note: _setup_region now only validates regions - it assumes normalization + has already been done by normalize_us_region() earlier in the flow. 
+ """ + + def test__given_prefixed_us_state__returns_unchanged(self): + # Test with a normalized US state (prefixed format) + service = EconomyService() + result = service._setup_region("us", "state/ca") assert result == "state/ca" - def test__given_non_us_state__returns_correct_region(self): - # Test with non-US region - country_id = "uk" - region = "country/england" + def test__given_non_us_region__returns_unchanged(self): + # Test with non-US region - no validation performed + service = EconomyService() + result = service._setup_region("uk", "country/england") + assert result == "country/england" - # Create an instance of the class + def test__given_us_national__returns_us(self): service = EconomyService() - # Call the method - result = service._setup_region(country_id, region) - # Assert the expected value - assert result == region + result = service._setup_region("us", "us") + assert result == "us" - class TestSetupData: - def test__given_enhanced_cps_dataset__returns_correct_gcp_path(self): - # Test with enhanced_cps dataset - dataset = "enhanced_cps" - country_id = "us" - region = "us" + def test__given_prefixed_state_tx__returns_unchanged(self): + service = EconomyService() + result = service._setup_region("us", "state/tx") + assert result == "state/tx" - # Create an instance of the class + def test__given_congressional_district__returns_unchanged(self): service = EconomyService() - # Call the method - result = service._setup_data(dataset, country_id, region) - # Assert the expected value - assert result == "gs://policyengine-us-data/enhanced_cps_2024.h5" - - def test__given_us_state_dataset__returns_none(self): - # Test with US state dataset - should return None - dataset = "us_state" - country_id = "us" - region = "ca" + result = service._setup_region( + "us", "congressional_district/CA-37" + ) + assert result == "congressional_district/CA-37" - # Create an instance of the class + def test__given_lowercase_congressional_district__returns_unchanged( + 
self, + ): service = EconomyService() - # Call the method - result = service._setup_data(dataset, country_id, region) - # Assert the expected value - assert result is None + result = service._setup_region( + "us", "congressional_district/ca-37" + ) + assert result == "congressional_district/ca-37" - def test__given_nyc_region__returns_pooled_cps(self): - # Test with NYC region - should return pooled CPS dataset - dataset = None - country_id = "us" - region = "nyc" + def test__given_invalid_prefixed_state__raises_value_error(self): + service = EconomyService() + with pytest.raises(ValueError) as exc_info: + service._setup_region("us", "state/mb") + assert "Invalid US state: 'mb'" in str(exc_info.value) - # Create an instance of the class + def test__given_invalid_congressional_district__raises_value_error( + self, + ): service = EconomyService() - # Call the method - result = service._setup_data(dataset, country_id, region) - # Assert the expected value + with pytest.raises(ValueError) as exc_info: + service._setup_region("us", "congressional_district/cruft") + assert "Invalid congressional district: 'cruft'" in str( + exc_info.value + ) + + def test__given_invalid_prefix__raises_value_error(self): + service = EconomyService() + with pytest.raises(ValueError) as exc_info: + service._setup_region("us", "invalid_prefix/tx") + assert "Invalid US region: 'invalid_prefix/tx'" in str( + exc_info.value + ) + + def test__given_invalid_bare_value__raises_value_error(self): + # Bare values without prefix are now invalid (should be normalized first) + service = EconomyService() + with pytest.raises(ValueError) as exc_info: + service._setup_region("us", "invalid_value") + assert "Invalid US region: 'invalid_value'" in str(exc_info.value) + + def test__given_city_nyc__returns_unchanged(self): + # Test normalized "city/nyc" format passes through + service = EconomyService() + result = service._setup_region("us", "city/nyc") + assert result == "city/nyc" + + class TestSetupData: + 
"""Tests for _setup_data method. + + Note: _setup_data now uses get_default_dataset from policyengine package + to return GCS paths for all region types (not None). + """ + + def test__given_us_city_nyc__returns_pooled_cps(self): + # Test with normalized city/nyc format + service = EconomyService() + result = service._setup_data("us", "city/nyc") assert ( result == "gs://policyengine-us-data/pooled_3_year_cps_2023.h5" ) - def test__given_us_nationwide_dataset__returns_none(self): - # Test with US nationwide dataset - dataset = "us_nationwide" - country_id = "us" - region = "us" + def test__given_us_state_ca__returns_state_dataset(self): + # Test with US state - returns state-specific dataset + service = EconomyService() + result = service._setup_data("us", "state/ca") + assert result == "gs://policyengine-us-data/states/CA.h5" - # Create an instance of the class + def test__given_us_state_ut__returns_state_dataset(self): + # Test with Utah state - returns state-specific dataset service = EconomyService() - # Call the method - result = service._setup_data(dataset, country_id, region) - # Assert the expected value - assert result is None + result = service._setup_data("us", "state/ut") + assert result == "gs://policyengine-us-data/states/UT.h5" - def test__given_uk_dataset__returns_none(self): - # Test with UK dataset - dataset = "uk_dataset" - country_id = "uk" - region = "country/england" + def test__given_us_nationwide__returns_cps_dataset(self): + # Test with US nationwide region + service = EconomyService() + result = service._setup_data("us", "us") + assert result == "gs://policyengine-us-data/cps_2023.h5" - # Create an instance of the class + def test__given_congressional_district__returns_district_dataset(self): + # Test with congressional district - returns district-specific dataset service = EconomyService() - # Call the method - result = service._setup_data(dataset, country_id, region) - # Assert the expected value - assert result is None + result = 
service._setup_data("us", "congressional_district/CA-37") + assert result == "gs://policyengine-us-data/districts/CA-37.h5" + + def test__given_uk__returns_efrs_dataset(self): + # Test with UK - returns enhanced FRS dataset + service = EconomyService() + result = service._setup_data("uk", "uk") + assert ( + result + == "gs://policyengine-uk-data-private/enhanced_frs_2023_24.h5" + ) + + def test__given_invalid_country__raises_value_error(self, mock_logger): + # Test with invalid country + service = EconomyService() + with pytest.raises(ValueError) as exc_info: + service._setup_data("invalid", "region") + assert "invalid" in str(exc_info.value).lower() + + class TestValidateUsRegion: + """Tests for the _validate_us_region method.""" + + def test__given_valid_state__does_not_raise(self): + service = EconomyService() + # Should not raise + service._validate_us_region("state/ca") + + def test__given_valid_state_uppercase__does_not_raise(self): + service = EconomyService() + # Case-insensitive validation + service._validate_us_region("state/CA") + + def test__given_invalid_state__raises_value_error(self): + service = EconomyService() + with pytest.raises(ValueError) as exc_info: + service._validate_us_region("state/mb") + assert "Invalid US state: 'mb'" in str(exc_info.value) + + def test__given_valid_congressional_district__does_not_raise(self): + service = EconomyService() + service._validate_us_region("congressional_district/CA-37") + + def test__given_valid_congressional_district_lowercase__does_not_raise( + self, + ): + service = EconomyService() + service._validate_us_region("congressional_district/ca-37") + + def test__given_invalid_congressional_district__raises_value_error( + self, + ): + service = EconomyService() + with pytest.raises(ValueError) as exc_info: + service._validate_us_region("congressional_district/CA-99") + assert "Invalid congressional district: 'CA-99'" in str( + exc_info.value + ) + + def 
test__given_nonexistent_district__raises_value_error(self): + service = EconomyService() + with pytest.raises(ValueError) as exc_info: + service._validate_us_region("congressional_district/cruft") + assert "Invalid congressional district: 'cruft'" in str( + exc_info.value + ) diff --git a/tests/unit/services/test_metadata_service.py b/tests/unit/services/test_metadata_service.py index 80163b4c..ac33d525 100644 --- a/tests/unit/services/test_metadata_service.py +++ b/tests/unit/services/test_metadata_service.py @@ -46,7 +46,18 @@ def test_get_metadata_empty_country_id(self): "country/ni", ], ), - ("us", 2, ["us", "ca", "ny", "tx", "fl"]), + ( + "us", + 2, + [ + "us", + "state/ca", + "state/ny", + "state/tx", + "state/fl", + "city/nyc", + ], + ), ("ca", 3, ["ca"]), ("ng", 4, ["ng"]), ("il", 5, ["il"]), @@ -108,3 +119,29 @@ def test_verify_metadata_for_given_country( # Verify datasets exist and are of correct type assert "datasets" in metadata["economy_options"] assert isinstance(metadata["economy_options"]["datasets"], list) + + @pytest.mark.parametrize( + "country_id, expected_types", + [ + ("uk", ["national", "country", "constituency"]), + ("us", ["national", "state", "city", "congressional_district"]), + ], + ) + def test_verify_region_types_for_given_country( + self, country_id, expected_types + ): + """ + Verifies that all regions for UK and US have a 'type' field + with valid values. + """ + service = MetadataService() + metadata = service.get_metadata(country_id) + + regions = metadata["economy_options"]["region"] + for region in regions: + assert ( + "type" in region + ), f"Region '{region['name']}' missing 'type' field" + assert ( + region["type"] in expected_types + ), f"Region '{region['name']}' has invalid type '{region['type']}'"