From cd6cf58763ef079a2f434bd3c3915de31f7a1d3b Mon Sep 17 00:00:00 2001
From: baogorek
Date: Tue, 29 Jul 2025 08:56:11 -0400
Subject: [PATCH 01/27] first round of eitc targets are added

---
 .../db/load_treasury_targets.py | 162 ++++++++++++++++++
 pyproject.toml                  |   1 +
 2 files changed, 163 insertions(+)
 create mode 100644 policyengine_us_data/db/load_treasury_targets.py

diff --git a/policyengine_us_data/db/load_treasury_targets.py b/policyengine_us_data/db/load_treasury_targets.py
new file mode 100644
index 00000000..4326f269
--- /dev/null
+++ b/policyengine_us_data/db/load_treasury_targets.py
@@ -0,0 +1,162 @@
+import logging
+import requests
+from pathlib import Path
+import io
+
+import pandas as pd
+import numpy as np
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker
+
+from policyengine_us_data.db.create_database_tables import (
+    Stratum,
+    StratumConstraint,
+    Target,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+def extract_eitc_data():
+    # IRS Table 2.5, Tax Year 2020
+    url = "https://www.irs.gov/pub/irs-soi/20in25ic.xls"
+    r = requests.get(url, timeout=30)
+    r.raise_for_status()
+
+    # Pandas uses xlrd to open .xls
+    xls = pd.ExcelFile(io.BytesIO(r.content), engine="xlrd")
+    sheets = {name: xls.parse(name, header=None) for name in xls.sheet_names}
+
+    raw = sheets[xls.sheet_names[0]]
+    return raw
+
+
+def transform_eitc_data(raw_data):
+    # This is not ideal from a data processing standpoint, but it's too much
+    # effort to fully parse this hierarchical XLS for a few data points.
+    # At least the full lineage is represented from the source.
+
+    zero_children_returns = raw_data.iloc[8, 25]
+    zero_children_amount = raw_data.iloc[8, 26] * 1000
+
+    one_child_returns = raw_data.iloc[8, 39]
+    one_child_amount = raw_data.iloc[8, 40] * 1000
+
+    two_children_returns = raw_data.iloc[8, 57]
+    two_children_amount = raw_data.iloc[8, 58] * 1000
+
+    three_plus_children_returns = raw_data.iloc[8, 73]
+    three_plus_children_amount = raw_data.iloc[8, 74] * 1000
+
+    assert zero_children_returns == 7636714
+    assert zero_children_amount == 2255068000
+
+    df_long = pd.DataFrame([
+        ["0100000US", "children_equal_to", 0, "tax_unit_count", zero_children_returns],
+        ["0100000US", "children_equal_to", 1, "tax_unit_count", one_child_returns],
+        ["0100000US", "children_equal_to", 2, "tax_unit_count", two_children_returns],
+        ["0100000US", "children_greater_or_equal_to", 3, "tax_unit_count", three_plus_children_returns],
+        ["0100000US", "children_equal_to", 0, "eitc", zero_children_amount],
+        ["0100000US", "children_equal_to", 1, "eitc", one_child_amount],
+        ["0100000US", "children_equal_to", 2, "eitc", two_children_amount],
+        ["0100000US", "children_greater_or_equal_to", 3, "eitc", three_plus_children_amount],
+    ])
+
+    df_long.columns = ["ucgid", "constraint", "constraint_value", "variable", "value"]
+
+    df_long["period"] = 2020
+    df_long["reform_id"] = 0
+    df_long["source_id"] = 2
+    df_long["active"] = True
+
+    return df_long
+
+
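+# A minimal usage sketch of the two steps above (illustrative only):
+#
+#   df_long = transform_eitc_data(extract_eitc_data())
+#   # four child buckets (0, 1, 2, 3+) for each of the two variables
+#   assert (df_long.groupby("variable").size() == 4).all()
+#   # amounts are in dollars, so the eitc total should dwarf the return counts
+#   assert (
+#       df_long.loc[df_long.variable == "eitc", "value"].sum()
+#       > df_long.loc[df_long.variable == "tax_unit_count", "value"].sum()
+#   )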
+def load_eitc_data(df_long):
+
+    DATABASE_URL = "sqlite:///policy_data.db"
+    engine = create_engine(DATABASE_URL)
+
+    Session = sessionmaker(bind=engine)
+    session = Session()
+
+    ucgid = df_long.iloc[0]['ucgid']
+    for num_children in [0, 1, 2, 3]:
+        note = f"eitc_child_count: {num_children}, Geo: {ucgid}"
+        new_stratum = Stratum(
+            parent_stratum_id=None, stratum_group_id=0, notes=note
+        )
+
+        new_stratum.constraints_rel = [
+            StratumConstraint(
+                constraint_variable="ucgid",
+                operation="equals",
+                value=ucgid,
+            ),
+        ]
+
+        if num_children <= 2:
+            new_stratum.constraints_rel.append(
+                StratumConstraint(
+                    constraint_variable="eitc_child_count",
+                    operation="equals",
+                    value=str(num_children),
+                ),
+            )
+        elif num_children > 2:
+            new_stratum.constraints_rel.append(
+                StratumConstraint(
+                    constraint_variable="eitc_child_count",
+                    operation="greater_or_equal_than",
+                    value=str(3),
+                ),
+            )
+
+        rows = df_long.loc[df_long['constraint_value'] == num_children]
+        count_target = rows.loc[rows.variable == 'tax_unit_count']['value'].values[0]
+        amount_target = rows.loc[rows.variable == 'eitc']['value'].values[0]
+
+        # Avoiding magic numbers in the load step
+        count_active = rows.loc[rows.variable == 'tax_unit_count']['active'].values[0]
+        amount_active = rows.loc[rows.variable == 'eitc']['active'].values[0]
+
+        period = rows.iloc[0]['period']
+        source_id = rows.iloc[0]['source_id']
+
+        new_stratum.targets_rel = [
+            Target(
+                variable="eitc",
+                period=period,
+                value=amount_target,
+                source_id=source_id,
+                active=amount_active,
+            ),
+            Target(
+                variable="tax_unit_count",
+                period=period,
+                value=count_target,
+                source_id=source_id,
+                active=count_active,
+            ),
+        ]
+
+        session.add(new_stratum)
+        session.flush()
+        print(new_stratum.stratum_id)
+
+    session.commit()
+
+
+if __name__ == "__main__":
+
+    # --- ETL: Extract, Transform, Load ----
+
+    # ---- Extract ----------
+    national_df = extract_eitc_data()
+
+    # --- Transform ----------
+    long_national_df = transform_eitc_data(national_df)
+
+    # --- Load --------
+    load_eitc_data(long_national_df)
diff --git a/pyproject.toml b/pyproject.toml
index aac1a318..21e53d17 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,6 +41,7 @@ dependencies = [
     "us>=2.0.0",
     "sqlalchemy>=2.0.41",
     "sqlmodel>=0.0.24",
+    "xlrd>=2.0.2",
 ]
 
 [project.optional-dependencies]

From 867bec647f48d2c675aa580fc10241c65154801d Mon Sep 17 00:00:00 2001
From: baogorek
Date: Tue, 29 Jul 2025 08:56:44 -0400
Subject: [PATCH 02/27] linting

---
 .../db/load_treasury_targets.py | 124 +++++++++++++-----
 1 file changed, 89 insertions(+), 35 deletions(-)

diff --git a/policyengine_us_data/db/load_treasury_targets.py b/policyengine_us_data/db/load_treasury_targets.py
index 4326f269..20d52cef 100644
--- a/policyengine_us_data/db/load_treasury_targets.py
+++ b/policyengine_us_data/db/load_treasury_targets.py
@@ -21,13 +21,13 @@ def extract_eitc_data():
     # IRS Table 2.5, Tax Year 2020
     url = "https://www.irs.gov/pub/irs-soi/20in25ic.xls"
-    r = requests.get(url, timeout=30) 
+    r = requests.get(url, timeout=30)
     r.raise_for_status()
-    
+
     # Pandas uses xlrd to open .xls
     xls = pd.ExcelFile(io.BytesIO(r.content), engine="xlrd")
     sheets = {name: xls.parse(name, header=None) for name in xls.sheet_names}
-    
+
     raw = sheets[xls.sheet_names[0]]
     return raw
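# NOTE: the -/+ pairs in the hunk above differ only in trailing whitespace;
# the hunks below reflow long literals. The style matches black's defaults,
# so a plausible invocation (assuming black is the project's formatter) is:
#
#   black policyengine_us_data/db/load_treasury_targets.py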
@@ -39,32 +39,82 @@ def transform_eitc_data(raw_data):
 
     zero_children_returns = raw_data.iloc[8, 25]
     zero_children_amount = raw_data.iloc[8, 26] * 1000
-    
+
     one_child_returns = raw_data.iloc[8, 39]
     one_child_amount = raw_data.iloc[8, 40] * 1000
-    
+
     two_children_returns = raw_data.iloc[8, 57]
     two_children_amount = raw_data.iloc[8, 58] * 1000
 
     three_plus_children_returns = raw_data.iloc[8, 73]
     three_plus_children_amount = raw_data.iloc[8, 74] * 1000
 
-    assert zero_children_returns == 7636714 
-    assert zero_children_amount == 2255068000 
+    assert zero_children_returns == 7636714
+    assert zero_children_amount == 2255068000
 
-    df_long = pd.DataFrame([
-        ["0100000US", "children_equal_to", 0, "tax_unit_count", zero_children_returns],
-        ["0100000US", "children_equal_to", 1, "tax_unit_count", one_child_returns],
-        ["0100000US", "children_equal_to", 2, "tax_unit_count", two_children_returns],
-        ["0100000US", "children_greater_or_equal_to", 3, "tax_unit_count", three_plus_children_returns],
-        ["0100000US", "children_equal_to", 0, "eitc", zero_children_amount],
-        ["0100000US", "children_equal_to", 1, "eitc", one_child_amount],
-        ["0100000US", "children_equal_to", 2, "eitc", two_children_amount],
-        ["0100000US", "children_greater_or_equal_to", 3, "eitc", three_plus_children_amount],
-    ])
-
-    df_long.columns = ["ucgid", "constraint", "constraint_value", "variable", "value"]
-
+    df_long = pd.DataFrame(
+        [
+            [
+                "0100000US",
+                "children_equal_to",
+                0,
+                "tax_unit_count",
+                zero_children_returns,
+            ],
+            [
+                "0100000US",
+                "children_equal_to",
+                1,
+                "tax_unit_count",
+                one_child_returns,
+            ],
+            [
+                "0100000US",
+                "children_equal_to",
+                2,
+                "tax_unit_count",
+                two_children_returns,
+            ],
+            [
+                "0100000US",
+                "children_greater_or_equal_to",
+                3,
+                "tax_unit_count",
+                three_plus_children_returns,
+            ],
+            [
+                "0100000US",
+                "children_equal_to",
+                0,
+                "eitc",
+                zero_children_amount,
+            ],
+            ["0100000US", "children_equal_to", 1, "eitc", one_child_amount],
+            [
+                "0100000US",
+                "children_equal_to",
+                2,
+                "eitc",
+                two_children_amount,
+            ],
+            [
+                "0100000US",
+                "children_greater_or_equal_to",
+                3,
+                "eitc",
+                three_plus_children_amount,
+            ],
+        ]
+    )
+
+    df_long.columns = [
+        "ucgid",
+        "constraint",
+        "constraint_value",
+        "variable",
+        "value",
+    ]
+
     df_long["period"] = 2020
     df_long["reform_id"] = 0
     df_long["source_id"] = 2
@@ -81,7 +131,7 @@ def load_eitc_data(df_long):
     Session = sessionmaker(bind=engine)
     session = Session()
 
-    ucgid = df_long.iloc[0]['ucgid']
+    ucgid = df_long.iloc[0]["ucgid"]
     for num_children in [0, 1, 2, 3]:
         note = f"eitc_child_count: {num_children}, Geo: {ucgid}"
         new_stratum = Stratum(
@@ -99,30 +149,34 @@ def load_eitc_data(df_long):
         if num_children <= 2:
             new_stratum.constraints_rel.append(
                 StratumConstraint(
-                    constraint_variable="eitc_child_count", 
-                    operation="equals", 
-                    value=str(num_children), 
+                    constraint_variable="eitc_child_count",
+                    operation="equals",
+                    value=str(num_children),
                 ),
             )
         elif num_children > 2:
             new_stratum.constraints_rel.append(
                 StratumConstraint(
-                    constraint_variable="eitc_child_count", 
-                    operation="greater_or_equal_than", 
-                    value=str(3), 
+                    constraint_variable="eitc_child_count",
+                    operation="greater_or_equal_than",
+                    value=str(3),
                 ),
             )
 
-        rows = df_long.loc[df_long['constraint_value'] == num_children]
-        count_target = rows.loc[rows.variable == 'tax_unit_count']['value'].values[0]
-        amount_target = rows.loc[rows.variable == 'eitc']['value'].values[0]
+        rows = df_long.loc[df_long["constraint_value"] == num_children]
+        count_target = rows.loc[rows.variable == "tax_unit_count"][
+            "value"
+        ].values[0]
+        amount_target = rows.loc[rows.variable == "eitc"]["value"].values[0]
 
         # Avoiding magic numbers in the load step
-        count_active = rows.loc[rows.variable == 'tax_unit_count']['active'].values[0]
-        amount_active = rows.loc[rows.variable == 'eitc']['active'].values[0]
+        count_active = rows.loc[rows.variable == "tax_unit_count"][
+            "active"
+        ].values[0]
+        amount_active = rows.loc[rows.variable == "eitc"]["active"].values[0]
 
-        period = rows.iloc[0]['period']
-        source_id = rows.iloc[0]['source_id']
+        period = rows.iloc[0]["period"]
+        source_id = rows.iloc[0]["source_id"]
 
         new_stratum.targets_rel = [
             Target(

From c2dd4af41b0792866acfe676aa8f15098a26635f Mon Sep 17 00:00:00 2001
From: baogorek
Date: Tue, 29 Jul 2025 09:09:18 -0400
Subject:
[PATCH 03/27] changelog_entry.yaml --- changelog_entry.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 6331425b..5bd54961 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,5 +1,4 @@ - bump: minor changes: added: - - Added creation script to build relational database for targets - - Refactored age targets load script to load the database + - load script for eitc targets From 95a4a9a48450098c1cfe1a204a7a1dbb0ef4450e Mon Sep 17 00:00:00 2001 From: baogorek Date: Sat, 2 Aug 2025 09:53:51 -0400 Subject: [PATCH 04/27] new file in progress --- policyengine_us_data/db/load_soi_targets.py | 607 ++++++++++++++++++++ 1 file changed, 607 insertions(+) create mode 100644 policyengine_us_data/db/load_soi_targets.py diff --git a/policyengine_us_data/db/load_soi_targets.py b/policyengine_us_data/db/load_soi_targets.py new file mode 100644 index 00000000..74e3dd1d --- /dev/null +++ b/policyengine_us_data/db/load_soi_targets.py @@ -0,0 +1,607 @@ +# This is the file where we actually get the SOI information that we want: + +# Goal: start with raw AGI and EITC: +# Data Dictionary: https://www.irs.gov/pub/irs-soi/22incddocguide.docx +# The Data: https://www.irs.gov/pub/irs-soi/22incd.csv + +from pathlib import Path + +from typing import Optional, Union + +import numpy as np +import pandas as pd +import logging + +from policyengine_us_data.storage import CALIBRATION_FOLDER + +logger = logging.getLogger(__name__) + +"""Utilities to pull AGI targets from the IRS SOI data files.""" + +# Congressional districts have one fewer level than the national and state +# They're missing the million plus category +# ("No AGI Stub") is a specific, intentional category used by the IRS in its summary data files. +SOI_COLUMNS = [ + "Under $1", + "$1 under $10,000", + "$10,000 under $25,000", + "$25,000 under $50,000", + "$50,000 under $75,000", + "$75,000 under $100,000", + "$100,000 under $200,000", + "$200,000 under $500,000", + "$500,000 or more", +] + +AGI_STUB_TO_BAND = {i + 1: band for i, band in enumerate(SOI_COLUMNS)} + +AGI_BOUNDS = { + "Under $1": (-np.inf, 1), + "$1 under $10,000": (1, 10_000), + "$10,000 under $25,000": (10_000, 25_000), + "$25,000 under $50,000": (25_000, 50_000), + "$50,000 under $75,000": (50_000, 75_000), + "$75,000 under $100,000": (75_000, 100_000), + "$100,000 under $200,000": (100_000, 200_000), + "$200,000 under $500,000": (200_000, 500_000), + "$500,000 or more": (500_000, np.inf), +} + +#NON_VOTING_STATES = {"US", "AS", "GU", "MP", "PR", "VI", "OA"} + +IGNORE_GEO_IDS = { + "0400000US72", # Puerto Rico (state level) + "5001800US7298", # Puerto Rico + "5001800US6098", # American Samoa + "5001800US6698", # Guam + "5001800US6998", # Northern Mariana Islands + "5001800US7898", # U.S. 
Virgin Islands +} + +# after skipping the first 7 rows, the national SOI file has targets as row indices [COUNT_INDEX, AMOUNT_INDEX] +NATIONAL_VARIABLES = { + "adjusted_gross_income": [0, 17], +} + +# the state and district SOI file have targets as column names [COUNT_COL_NAME, AMOUNT_COL_NAME] +GEOGRAPHY_VARIABLES = {"adjusted_gross_income": ["N1", "A00100"]} + +STATE_ABBR_TO_FIPS = { + "AL": "01", + "AK": "02", + "AZ": "04", + "AR": "05", + "CA": "06", + "CO": "08", + "CT": "09", + "DC": "11", + "DE": "10", + "FL": "12", + "GA": "13", + "HI": "15", + "ID": "16", + "IL": "17", + "IN": "18", + "IA": "19", + "KS": "20", + "KY": "21", + "LA": "22", + "ME": "23", + "MD": "24", + "MA": "25", + "MI": "26", + "MN": "27", + "MS": "28", + "MO": "29", + "MT": "30", + "NE": "31", + "NV": "32", + "NH": "33", + "NJ": "34", + "NM": "35", + "NY": "36", + "NC": "37", + "ND": "38", + "OH": "39", + "OK": "40", + "OR": "41", + "PA": "42", + "RI": "44", + "SC": "45", + "SD": "46", + "TN": "47", + "TX": "48", + "UT": "49", + "VT": "50", + "VA": "51", + "WA": "53", + "WV": "54", + "WI": "55", + "WY": "56", +} +FIPS_TO_STATE_ABBR = {v: k for k, v in STATE_ABBR_TO_FIPS.items()} + + +def pull_national_soi_variable( + soi_variable_ident: int, # the national SOI xlsx file has a row for each target variable + variable_name: Union[str, None], + is_count: bool, + national_df: Optional[pd.DataFrame] = None, +) -> pd.DataFrame: + """Download and save national AGI totals.""" + df = pd.read_excel( + "https://www.irs.gov/pub/irs-soi/22in54us.xlsx", skiprows=7 + ) + + assert ( + np.abs( + df.iloc[soi_variable_ident, 1] + - df.iloc[soi_variable_ident, 2:12].sum() + ) + < 100 + ), "Row 0 doesn't add up — check the file." + + agi_values = df.iloc[soi_variable_ident, 2:12].astype(int).to_numpy() + agi_values = np.concatenate( + [agi_values[:8], [agi_values[8] + agi_values[9]]] + ) + + agi_brackets = [ + AGI_STUB_TO_BAND[i] for i in range(1, len(SOI_COLUMNS) + 1) + ] + + result = pd.DataFrame( + { + "GEO_ID": ["0100000US"] * len(agi_brackets), + "GEO_NAME": ["national"] * len(agi_brackets), + "LOWER_BOUND": [AGI_BOUNDS[b][0] for b in agi_brackets], + "UPPER_BOUND": [AGI_BOUNDS[b][1] for b in agi_brackets], + "VALUE": agi_values, + } + ) + + # final column order + result = result[ + ["GEO_ID", "GEO_NAME", "LOWER_BOUND", "UPPER_BOUND", "VALUE"] + ] + result["IS_COUNT"] = int(is_count) + result["VARIABLE"] = variable_name + + result["VALUE"] = np.where( + result["IS_COUNT"] == 0, result["VALUE"] * 1_000, result["VALUE"] + ) + + if national_df is not None: + # If a DataFrame is passed, we append the new data to it. 
+ df = pd.concat([national_df, result], ignore_index=True) + return df + + return result + + +def pull_state_soi_variable( + soi_variable_ident: str, # the state SOI csv file has a column for each target variable + variable_name: Union[str, None], + is_count: bool, + state_df: Optional[pd.DataFrame] = None, +) -> pd.DataFrame: + """Download and save state AGI totals.""" + df = pd.read_csv( + "https://www.irs.gov/pub/irs-soi/22in55cmcsv.csv", thousands="," + ) + + merged = ( + df[df["AGI_STUB"].isin([9, 10])] + .groupby("STATE", as_index=False) + .agg({soi_variable_ident: "sum"}) + .assign(AGI_STUB=9) + ) + df = df[~df["AGI_STUB"].isin([9, 10])] + df = pd.concat([df, merged], ignore_index=True) + df = df[df["AGI_STUB"] != 0] + + df["agi_bracket"] = df["AGI_STUB"].map(AGI_STUB_TO_BAND) + + df["state_abbr"] = df["STATE"] + df["GEO_ID"] = "0400000US" + df["state_abbr"].map(STATE_ABBR_TO_FIPS) + df["GEO_NAME"] = "state_" + df["state_abbr"] + + result = df.loc[ + ~df["STATE"].isin(NON_VOTING_STATES.union({"US"})), + ["GEO_ID", "GEO_NAME", "agi_bracket", soi_variable_ident], + ].rename(columns={soi_variable_ident: "VALUE"}) + + result["LOWER_BOUND"] = result["agi_bracket"].map( + lambda b: AGI_BOUNDS[b][0] + ) + result["UPPER_BOUND"] = result["agi_bracket"].map( + lambda b: AGI_BOUNDS[b][1] + ) + + # final column order + result = result[ + ["GEO_ID", "GEO_NAME", "LOWER_BOUND", "UPPER_BOUND", "VALUE"] + ] + result["IS_COUNT"] = int(is_count) + result["VARIABLE"] = variable_name + + result["VALUE"] = np.where( + result["IS_COUNT"] == 0, result["VALUE"] * 1_000, result["VALUE"] + ) + + if state_df is not None: + # If a DataFrame is passed, we append the new data to it. + df = pd.concat([state_df, result], ignore_index=True) + return df + + return result + + +def extract_soi_data() -> pd.DataFrame: + """Download and save congressional district AGI totals. + + In the file below, "22" is 2022, "in" is individual returns, + "cd" is congressional districts + + """ + return pd.read_csv("https://www.irs.gov/pub/irs-soi/22incd.csv") + + +raw_df = df +# a "stub" is a term the IRS uses for a predefined category or group, specifically an income bracket. + +def transform_soi_data(raw_df) + + # agi_stub is only 0, so there are only agi breakdowns at the state level + # So you can confirm summability for 0 and then forget that national exists + # Honestly I think that's a better idea in general. If your states don't add + # Up to your national, something's off and you should treat it as an immediate + # problem to fix rather than something to be adjusted + national_df = raw_df.copy().loc[ + (raw_df.STATE == "US") + ] + + # You've got agi_stub == 0 in here, which you want to use any time you don't want to + # break things up by AGI + state_df = raw_df.copy().loc[ + (raw_df.STATE != "US") & + (raw_df.CONG_DISTRICT == 0) + ] + + # This is going to fail because we're missing the single cong district states + district_df = raw_df.copy().loc[ + (raw_df.CONG_DISTRICT > 0) + ] + + max_cong_district_by_state = raw_df.groupby('STATE')['CONG_DISTRICT'].transform('max') + district_df = raw_df.copy().loc[ + (raw_df['CONG_DISTRICT'] > 0) | (max_cong_district_by_state == 0) + ] + district_df = district_df.loc[district_df['STATE'] != 'US'] + + assert district_df.shape[0] % 436 == 0 + + # And you've got everything you need for all 3 levels of targets from this guy + + # So I want to get 2 variable categories out of this thing, in long format + # 1) EITC, and 2) AGI + # There's eitc_child_count, eitc. 
There's person_count and tax_unit_count + # but no household_count. That's why you're doing this though, for a great example + # Wide (a new variable per number of children) or Long (breakdown variable is number of children) + + + district_df["STATEFIPS"] = district_df["STATEFIPS"].astype(int).astype(str).str.zfill(2) + district_df["CONG_DISTRICT"] = ( + district_df["CONG_DISTRICT"].astype(int).astype(str).str.zfill(2) + ) + district_df["ucgid_str"] = "5001800US" + district_df["STATEFIPS"] + district_df["CONG_DISTRICT"] + + district_df = district_df[~district_df["ucgid_str"].isin(IGNORE_GEO_IDS)] + + # eitc: you'll only want to take agi_stub = 0 cases + + district_marginals = district_df.copy().loc[district_df.agi_stub == 0] + assert district_marginals.shape[0] == 436 + + eitc_no_children = district_marginals.copy()[['ucgid_str', 'N59661', 'A59661']].rename({ + 'N59661': 'tax_unit_count', + 'A59661': 'eitc' + }, axis = 1) + + eitc_no_children['eitc_children'] = 0 + + eitc_one_child = district_marginals.copy()[['ucgid_str', 'N59662', 'A59662']].rename({ + 'N59662': 'tax_unit_count', + 'A59662': 'eitc' + }, axis=1) + eitc_one_child['eitc_children'] = 1 + + eitc_two_children = district_marginals.copy()[['ucgid_str', 'N59663', 'A59663']].rename({ + 'N59663': 'tax_unit_count', + 'A59663': 'eitc' + }, axis=1) + eitc_two_children['eitc_children'] = 2 + + eitc_three_plus_children = district_marginals.copy()[['ucgid_str', 'N59664', 'A59664']].rename({ + 'N59664': 'tax_unit_count', + 'A59664': 'eitc' + }, axis=1) + eitc_three_plus_children['eitc_children'] = '3+' + + # Question: so many: why do this processing at the district level, since the structure is the same all over? + # OR, is it? At least the renaming is. + # Keep going for now and see how much you can generalize + + + + + at_large_states = ( + district_df.groupby("STATEFIPS")["CONG_DISTRICT"] + .nunique() + .pipe(lambda s: s[s == 1].index) + ) + district_df = district_df.loc[ + (district_df["CONG_DISTRICT"] != "00") | (district_df["STATEFIPS"].isin(at_large_states)) + ].reset_index(drop=True) + + district_df["GEO_NAME"] = "district_" + ( + f"{district_df['STATEFIPS'].map(FIPS_TO_STATE_ABBR)}-{district_df['CONG_DISTRICT']}" + ) + + district_df["agi_bracket"] = district_df["agi_stub"].map(AGI_STUB_TO_BAND) + + district_df + + result = df[ + [ + "GEO_ID", + "GEO_NAME", + "CONG_DISTRICT", + "STATE", + "agi_bracket", + soi_variable_ident, + ] + ].rename(columns={soi_variable_ident: "VALUE"}) + + result["LOWER_BOUND"] = result["agi_bracket"].map( + lambda b: AGI_BOUNDS[b][0] + ) + result["UPPER_BOUND"] = result["agi_bracket"].map( + lambda b: AGI_BOUNDS[b][1] + ) + + # if redistrict: + # result = apply_redistricting(result, variable_name) + + assert df["GEO_ID"].nunique() == 436 + + if redistrict: + # After redistricting, validate against the new district codes from the mapping + mapping_df = pd.read_csv(CALIBRATION_FOLDER / "district_mapping.csv") + valid_district_codes = set(mapping_df["code_new"].unique()) + + # Check that all GEO_IDs are valid + produced_codes = set(result["GEO_ID"]) + invalid_codes = produced_codes - valid_district_codes + assert ( + not invalid_codes + ), f"Invalid district codes after redistricting: {invalid_codes}" + + # Check we have exactly 436 districts + assert ( + len(produced_codes) == 436 + ), f"Expected 436 districts after redistricting, got {len(produced_codes)}" + + # Check that all GEO_IDs successfully mapped to names + missing_names = result[result["GEO_NAME"].isna()]["GEO_ID"].unique() + assert ( + 
len(missing_names) == 0 + ), f"GEO_IDs without names in ID_TO_NAME mapping: {missing_names}" + + # final column order + result = result[ + ["GEO_ID", "GEO_NAME", "LOWER_BOUND", "UPPER_BOUND", "VALUE"] + ] + result["IS_COUNT"] = int(is_count) + result["VARIABLE"] = variable_name + + result["VALUE"] = np.where( + result["IS_COUNT"] == 0, result["VALUE"] * 1_000, result["VALUE"] + ) + + if district_df is not None: + # If a DataFrame is passed, we append the new data to it. + df = pd.concat([district_df, result], ignore_index=True) + return df + + return result + + +def _get_soi_data(geo_level: str) -> pd.DataFrame: + """ + geo_level ∈ {'National', 'State', 'District'} + Returns a DataFrame with all SOI variables for the specified geography level + """ + if geo_level == "National": + var_indices = NATIONAL_VARIABLES + variable_pull = pull_national_soi_variable + elif geo_level == "State": + var_indices = GEOGRAPHY_VARIABLES + variable_pull = pull_state_soi_variable + elif geo_level == "District": + var_indices = GEOGRAPHY_VARIABLES + variable_pull = pull_district_soi_variable + else: + raise ValueError("geo_level must be National, State or District") + + df = pd.DataFrame() + for variable, identifiers in var_indices.items(): + count_id, amount_id = identifiers + # Pull count data (first identifier) + count_df = variable_pull( + soi_variable_ident=count_id, + variable_name=variable, + is_count=float(True), + ) + df = pd.concat([df, count_df], ignore_index=True) + # Pull amount data (second identifier) + amount_df = variable_pull( + soi_variable_ident=amount_id, + variable_name=variable, + is_count=float(False), + ) + df = pd.concat([df, amount_df], ignore_index=True) + + return df + + +def combine_geography_levels(districts: Optional[bool] = False) -> None: + """Combine SOI data across geography levels with validation and rescaling.""" + national = _get_soi_data("National") + state = _get_soi_data("State") + if districts: + district = _get_soi_data("District") + + # Add state FIPS codes for validation + state["STATEFIPS"] = state["GEO_ID"].str[-2:] + if districts: + district["STATEFIPS"] = district["GEO_ID"].str[-4:-2] + + # Get unique variables and AGI brackets for iteration + variables = national["VARIABLE"].unique() + agi_brackets = national[["LOWER_BOUND", "UPPER_BOUND"]].drop_duplicates() + + # Validate and rescale state totals against national totals + for variable in variables: + for is_count in [0.0, 1.0]: # Process count and amount separately + for _, bracket in agi_brackets.iterrows(): + lower, upper = ( + bracket["LOWER_BOUND"], + bracket["UPPER_BOUND"], + ) + + # Get national total for this variable/bracket/type combination + nat_mask = ( + (national["VARIABLE"] == variable) + & (national["LOWER_BOUND"] == lower) + & (national["UPPER_BOUND"] == upper) + & (national["IS_COUNT"] == is_count) + ) + us_total = national.loc[nat_mask, "VALUE"].iloc[0] + + # Get state total for this variable/bracket/type combination + state_mask = ( + (state["VARIABLE"] == variable) + & (state["LOWER_BOUND"] == lower) + & (state["UPPER_BOUND"] == upper) + & (state["IS_COUNT"] == is_count) + ) + state_total = state.loc[state_mask, "VALUE"].sum() + + # Rescale states if they don't match national total + if not np.isclose(state_total, us_total, rtol=1e-3): + count_type = "count" if is_count == 1.0 else "amount" + logger.warning( + f"States' sum does not match national total for {variable}/{count_type} " + f"in bracket [{lower}, {upper}]. Rescaling state targets." 
+ ) + state.loc[state_mask, "VALUE"] *= us_total / state_total + + if districts: + # Validate and rescale district totals against state totals + for variable in variables: + for is_count in [0.0, 1.0]: # Process count and amount separately + for _, bracket in agi_brackets.iterrows(): + lower, upper = ( + bracket["LOWER_BOUND"], + bracket["UPPER_BOUND"], + ) + + # Create masks for this variable/bracket/type combination + state_mask = ( + (state["VARIABLE"] == variable) + & (state["LOWER_BOUND"] == lower) + & (state["UPPER_BOUND"] == upper) + & (state["IS_COUNT"] == is_count) + ) + district_mask = ( + (district["VARIABLE"] == variable) + & (district["LOWER_BOUND"] == lower) + & (district["UPPER_BOUND"] == upper) + & (district["IS_COUNT"] == is_count) + ) + + # Get state totals indexed by STATEFIPS + state_totals = state.loc[state_mask].set_index("STATEFIPS")[ + "VALUE" + ] + + # Get district totals grouped by STATEFIPS + district_totals = ( + district.loc[district_mask] + .groupby("STATEFIPS")["VALUE"] + .sum() + ) + + # Check and rescale districts for each state + for fips, d_total in district_totals.items(): + s_total = state_totals.get(fips) + + if s_total is not None and not np.isclose( + d_total, s_total, rtol=1e-3 + ): + count_type = "count" if is_count == 1.0 else "amount" + logger.warning( + f"Districts' sum does not match {fips} state total for {variable}/{count_type} " + f"in bracket [{lower}, {upper}]. Rescaling district targets." + ) + rescale_mask = district_mask & ( + district["STATEFIPS"] == fips + ) + district.loc[rescale_mask, "VALUE"] *= ( + s_total / d_total + ) + + # Combine all data + combined = pd.concat( + [ + national, + state.drop(columns="STATEFIPS"), + ( + district.drop(columns="STATEFIPS") + if districts + else pd.DataFrame(columns=national.columns) + ), + ], + ignore_index=True, + ).sort_values(["GEO_ID", "VARIABLE", "LOWER_BOUND"]) + + combined["DATA_SOURCE"] = "soi" + combined["BREAKDOWN_VARIABLE"] = "adjusted_gross_income" + + combined = combined[ + [ + "DATA_SOURCE", + "GEO_ID", + "GEO_NAME", + "VARIABLE", + "VALUE", + "IS_COUNT", + "BREAKDOWN_VARIABLE", + "LOWER_BOUND", + "UPPER_BOUND", + ] + ] + + # Save combined data + out_path = CALIBRATION_FOLDER / "soi.csv" + combined.to_csv(out_path, index=False) + logger.info(f"Combined SOI targets saved to {out_path}") + + +def main() -> None: + combine_geography_levels() + + +if __name__ == "__main__": + main() From 6fd3542998cbec24cb0cf2d096840a7e51715eee Mon Sep 17 00:00:00 2001 From: baogorek Date: Wed, 6 Aug 2025 23:39:41 -0400 Subject: [PATCH 05/27] moving to QBID and SALT --- policyengine_us_data/db/load_age_targets.py | 22 +- policyengine_us_data/db/load_soi_targets.py | 416 ++++++++++---------- 2 files changed, 219 insertions(+), 219 deletions(-) diff --git a/policyengine_us_data/db/load_age_targets.py b/policyengine_us_data/db/load_age_targets.py index b93c7687..b588c922 100644 --- a/policyengine_us_data/db/load_age_targets.py +++ b/policyengine_us_data/db/load_age_targets.py @@ -174,18 +174,18 @@ def transform_age_data(age_data, docs): ) df = df.drop(columns="NAME") - df = df.rename({"GEO_ID": "ucgid"}, axis=1) - df_data = df.rename(columns=rename_mapping)[["ucgid"] + list(AGE_COLS)] + df = df.rename({"GEO_ID": "ucgid_str"}, axis=1) + df_data = df.rename(columns=rename_mapping)[["ucgid_str"] + list(AGE_COLS)] # Filter out Puerto Rico's district and state records, if needed df_geos = df_data[ - ~df_data["ucgid"].isin(["5001800US7298", "0400000US72"]) + ~df_data["ucgid_str"].isin(["5001800US7298", 
"0400000US72"]) ].copy() - df = df_geos[["ucgid"] + AGE_COLS] + df = df_geos[["ucgid_str"] + AGE_COLS] df_long = df.melt( - id_vars="ucgid", + id_vars="ucgid_str", value_vars=AGE_COLS, var_name="age_range", value_name="value", @@ -212,11 +212,11 @@ def load_age_data(df_long, geo, stratum_lookup={}): # Quick data quality check before loading ---- if geo == "National": - assert len(set(df_long.ucgid)) == 1 + assert len(set(df_long.ucgid_str)) == 1 elif geo == "State": - assert len(set(df_long.ucgid)) == 51 + assert len(set(df_long.ucgid_str)) == 51 elif geo == "District": - assert len(set(df_long.ucgid)) == 436 + assert len(set(df_long.ucgid_str)) == 436 else: raise ValueError('geo must be one of "National", "State", "District"') @@ -238,7 +238,7 @@ def load_age_data(df_long, geo, stratum_lookup={}): # Create the parent Stratum object. # We will attach children to it before adding it to the session. - note = f"Age: {row['age_range']}, Geo: {row['ucgid']}" + note = f"Age: {row['age_range']}, Geo: {row['ucgid_str']}" parent_geo = get_parent_geo(geo) parent_stratum_id = ( stratum_lookup[parent_geo][row["age_range"]] @@ -253,9 +253,9 @@ def load_age_data(df_long, geo, stratum_lookup={}): # Create constraints and link them to the parent's relationship attribute. new_stratum.constraints_rel = [ StratumConstraint( - constraint_variable="ucgid", + constraint_variable="ucgid_str", operation="equals", - value=row["ucgid"], + value=row["ucgid_str"], ), StratumConstraint( constraint_variable="age", diff --git a/policyengine_us_data/db/load_soi_targets.py b/policyengine_us_data/db/load_soi_targets.py index 74e3dd1d..b564bbf8 100644 --- a/policyengine_us_data/db/load_soi_targets.py +++ b/policyengine_us_data/db/load_soi_targets.py @@ -122,119 +122,128 @@ FIPS_TO_STATE_ABBR = {v: k for k, v in STATE_ABBR_TO_FIPS.items()} -def pull_national_soi_variable( - soi_variable_ident: int, # the national SOI xlsx file has a row for each target variable - variable_name: Union[str, None], - is_count: bool, - national_df: Optional[pd.DataFrame] = None, -) -> pd.DataFrame: - """Download and save national AGI totals.""" - df = pd.read_excel( - "https://www.irs.gov/pub/irs-soi/22in54us.xlsx", skiprows=7 - ) - - assert ( - np.abs( - df.iloc[soi_variable_ident, 1] - - df.iloc[soi_variable_ident, 2:12].sum() - ) - < 100 - ), "Row 0 doesn't add up — check the file." - - agi_values = df.iloc[soi_variable_ident, 2:12].astype(int).to_numpy() - agi_values = np.concatenate( - [agi_values[:8], [agi_values[8] + agi_values[9]]] - ) - - agi_brackets = [ - AGI_STUB_TO_BAND[i] for i in range(1, len(SOI_COLUMNS) + 1) - ] - - result = pd.DataFrame( - { - "GEO_ID": ["0100000US"] * len(agi_brackets), - "GEO_NAME": ["national"] * len(agi_brackets), - "LOWER_BOUND": [AGI_BOUNDS[b][0] for b in agi_brackets], - "UPPER_BOUND": [AGI_BOUNDS[b][1] for b in agi_brackets], - "VALUE": agi_values, - } - ) - - # final column order - result = result[ - ["GEO_ID", "GEO_NAME", "LOWER_BOUND", "UPPER_BOUND", "VALUE"] - ] - result["IS_COUNT"] = int(is_count) - result["VARIABLE"] = variable_name - - result["VALUE"] = np.where( - result["IS_COUNT"] == 0, result["VALUE"] * 1_000, result["VALUE"] - ) - - if national_df is not None: - # If a DataFrame is passed, we append the new data to it. 
- df = pd.concat([national_df, result], ignore_index=True) - return df - - return result - - -def pull_state_soi_variable( - soi_variable_ident: str, # the state SOI csv file has a column for each target variable - variable_name: Union[str, None], - is_count: bool, - state_df: Optional[pd.DataFrame] = None, -) -> pd.DataFrame: - """Download and save state AGI totals.""" - df = pd.read_csv( - "https://www.irs.gov/pub/irs-soi/22in55cmcsv.csv", thousands="," - ) - - merged = ( - df[df["AGI_STUB"].isin([9, 10])] - .groupby("STATE", as_index=False) - .agg({soi_variable_ident: "sum"}) - .assign(AGI_STUB=9) - ) - df = df[~df["AGI_STUB"].isin([9, 10])] - df = pd.concat([df, merged], ignore_index=True) - df = df[df["AGI_STUB"] != 0] - - df["agi_bracket"] = df["AGI_STUB"].map(AGI_STUB_TO_BAND) - - df["state_abbr"] = df["STATE"] - df["GEO_ID"] = "0400000US" + df["state_abbr"].map(STATE_ABBR_TO_FIPS) - df["GEO_NAME"] = "state_" + df["state_abbr"] - - result = df.loc[ - ~df["STATE"].isin(NON_VOTING_STATES.union({"US"})), - ["GEO_ID", "GEO_NAME", "agi_bracket", soi_variable_ident], - ].rename(columns={soi_variable_ident: "VALUE"}) - - result["LOWER_BOUND"] = result["agi_bracket"].map( - lambda b: AGI_BOUNDS[b][0] - ) - result["UPPER_BOUND"] = result["agi_bracket"].map( - lambda b: AGI_BOUNDS[b][1] - ) - - # final column order - result = result[ - ["GEO_ID", "GEO_NAME", "LOWER_BOUND", "UPPER_BOUND", "VALUE"] - ] - result["IS_COUNT"] = int(is_count) - result["VARIABLE"] = variable_name - - result["VALUE"] = np.where( - result["IS_COUNT"] == 0, result["VALUE"] * 1_000, result["VALUE"] - ) - - if state_df is not None: - # If a DataFrame is passed, we append the new data to it. - df = pd.concat([state_df, result], ignore_index=True) - return df - - return result +#def pull_national_soi_variable( +# soi_variable_ident: int, # the national SOI xlsx file has a row for each target variable +# variable_name: Union[str, None], +# is_count: bool, +# national_df: Optional[pd.DataFrame] = None, +#) -> pd.DataFrame: +# """Download and save national AGI totals.""" +# df = pd.read_excel( +# "https://www.irs.gov/pub/irs-soi/22in54us.xlsx", skiprows=7 +# ) +# +# assert ( +# np.abs( +# df.iloc[soi_variable_ident, 1] +# - df.iloc[soi_variable_ident, 2:12].sum() +# ) +# < 100 +# ), "Row 0 doesn't add up — check the file." +# +# agi_values = df.iloc[soi_variable_ident, 2:12].astype(int).to_numpy() +# agi_values = np.concatenate( +# [agi_values[:8], [agi_values[8] + agi_values[9]]] +# ) +# +# agi_brackets = [ +# AGI_STUB_TO_BAND[i] for i in range(1, len(SOI_COLUMNS) + 1) +# ] +# +# result = pd.DataFrame( +# { +# "GEO_ID": ["0100000US"] * len(agi_brackets), +# "GEO_NAME": ["national"] * len(agi_brackets), +# "LOWER_BOUND": [AGI_BOUNDS[b][0] for b in agi_brackets], +# "UPPER_BOUND": [AGI_BOUNDS[b][1] for b in agi_brackets], +# "VALUE": agi_values, +# } +# ) +# +# # final column order +# result = result[ +# ["GEO_ID", "GEO_NAME", "LOWER_BOUND", "UPPER_BOUND", "VALUE"] +# ] +# result["IS_COUNT"] = int(is_count) +# result["VARIABLE"] = variable_name +# +# result["VALUE"] = np.where( +# result["IS_COUNT"] == 0, result["VALUE"] * 1_000, result["VALUE"] +# ) +# +# if national_df is not None: +# # If a DataFrame is passed, we append the new data to it. 
+# df = pd.concat([national_df, result], ignore_index=True) +# return df +# +# return result +# +# +#def pull_state_soi_variable( +# soi_variable_ident: str, # the state SOI csv file has a column for each target variable +# variable_name: Union[str, None], +# is_count: bool, +# state_df: Optional[pd.DataFrame] = None, +#) -> pd.DataFrame: +# """Download and save state AGI totals.""" +# df = pd.read_csv( +# "https://www.irs.gov/pub/irs-soi/22in55cmcsv.csv", thousands="," +# ) +# +# merged = ( +# df[df["AGI_STUB"].isin([9, 10])] +# .groupby("STATE", as_index=False) +# .agg({soi_variable_ident: "sum"}) +# .assign(AGI_STUB=9) +# ) +# df = df[~df["AGI_STUB"].isin([9, 10])] +# df = pd.concat([df, merged], ignore_index=True) +# df = df[df["AGI_STUB"] != 0] +# +# df["agi_bracket"] = df["AGI_STUB"].map(AGI_STUB_TO_BAND) +# +# df["state_abbr"] = df["STATE"] +# df["GEO_ID"] = "0400000US" + df["state_abbr"].map(STATE_ABBR_TO_FIPS) +# df["GEO_NAME"] = "state_" + df["state_abbr"] +# +# result = df.loc[ +# ~df["STATE"].isin(NON_VOTING_STATES.union({"US"})), +# ["GEO_ID", "GEO_NAME", "agi_bracket", soi_variable_ident], +# ].rename(columns={soi_variable_ident: "VALUE"}) +# +# result["LOWER_BOUND"] = result["agi_bracket"].map( +# lambda b: AGI_BOUNDS[b][0] +# ) +# result["UPPER_BOUND"] = result["agi_bracket"].map( +# lambda b: AGI_BOUNDS[b][1] +# ) +# +# # final column order +# result = result[ +# ["GEO_ID", "GEO_NAME", "LOWER_BOUND", "UPPER_BOUND", "VALUE"] +# ] +# result["IS_COUNT"] = int(is_count) +# result["VARIABLE"] = variable_name +# +# result["VALUE"] = np.where( +# result["IS_COUNT"] == 0, result["VALUE"] * 1_000, result["VALUE"] +# ) +# +# if state_df is not None: +# # If a DataFrame is passed, we append the new data to it. +# df = pd.concat([state_df, result], ignore_index=True) +# return df +# +# return result + +def create_records(df, breakdown_variable, target_variable): + """Transforms a DataFrame subset into a standardized list of records.""" + temp_df = df[["ucgid_str"]].copy() + temp_df["breakdown_variable"] = breakdown_variable + temp_df["breakdown_value"] = df[breakdown_variable] + temp_df["target_variable"] = target_variable + temp_df["target_value"] = df[target_variable] + return temp_df def extract_soi_data() -> pd.DataFrame: @@ -242,16 +251,16 @@ def extract_soi_data() -> pd.DataFrame: In the file below, "22" is 2022, "in" is individual returns, "cd" is congressional districts - """ return pd.read_csv("https://www.irs.gov/pub/irs-soi/22incd.csv") -raw_df = df +raw_df = extract_soi_data() # a "stub" is a term the IRS uses for a predefined category or group, specifically an income bracket. def transform_soi_data(raw_df) + # agi_stub is only 0, so there are only agi breakdowns at the state level # So you can confirm summability for 0 and then forget that national exists # Honestly I think that's a better idea in general. 
If your states don't add @@ -260,6 +269,7 @@ def transform_soi_data(raw_df) national_df = raw_df.copy().loc[ (raw_df.STATE == "US") ] + national_df["ucgid_str"] = "0100000US" # You've got agi_stub == 0 in here, which you want to use any time you don't want to # break things up by AGI @@ -267,6 +277,7 @@ def transform_soi_data(raw_df) (raw_df.STATE != "US") & (raw_df.CONG_DISTRICT == 0) ] + state_df["ucgid_str"] = "0400000US" + state_df["STATEFIPS"].astype(str).str.zfill(2) # This is going to fail because we're missing the single cong district states district_df = raw_df.copy().loc[ @@ -278,10 +289,21 @@ def transform_soi_data(raw_df) (raw_df['CONG_DISTRICT'] > 0) | (max_cong_district_by_state == 0) ] district_df = district_df.loc[district_df['STATE'] != 'US'] + district_df["STATEFIPS"] = district_df["STATEFIPS"].astype(int).astype(str).str.zfill(2) + district_df["CONG_DISTRICT"] = ( + district_df["CONG_DISTRICT"].astype(int).astype(str).str.zfill(2) + ) + district_df["ucgid_str"] = "5001800US" + district_df["STATEFIPS"] + district_df["CONG_DISTRICT"] + district_df = district_df[~district_df["ucgid_str"].isin(IGNORE_GEO_IDS)] assert district_df.shape[0] % 436 == 0 - # And you've got everything you need for all 3 levels of targets from this guy + # And you've got everything you need for all 3 levels of targets: + # 1. national_df + # 2. state_df + # 3. district_df + + all_df = pd.concat([national_df, state_df, district_df]) # So I want to get 2 variable categories out of this thing, in long format # 1) EITC, and 2) AGI @@ -289,132 +311,110 @@ def transform_soi_data(raw_df) # but no household_count. That's why you're doing this though, for a great example # Wide (a new variable per number of children) or Long (breakdown variable is number of children) + # Marginal in terms of AGI, which this data set is organized with respect to + all_marginals = all_df.copy().loc[all_df.agi_stub == 0] + assert all_marginals.shape[0] == 436 + 51 + 1 - district_df["STATEFIPS"] = district_df["STATEFIPS"].astype(int).astype(str).str.zfill(2) - district_df["CONG_DISTRICT"] = ( - district_df["CONG_DISTRICT"].astype(int).astype(str).str.zfill(2) - ) - district_df["ucgid_str"] = "5001800US" + district_df["STATEFIPS"] + district_df["CONG_DISTRICT"] - - district_df = district_df[~district_df["ucgid_str"].isin(IGNORE_GEO_IDS)] + # Collect targets from the SOI file + records = [] - # eitc: you'll only want to take agi_stub = 0 cases - - district_marginals = district_df.copy().loc[district_df.agi_stub == 0] - assert district_marginals.shape[0] == 436 - - eitc_no_children = district_marginals.copy()[['ucgid_str', 'N59661', 'A59661']].rename({ + # EITC --------------------------------------------------------------------------- + eitc_no_children = all_marginals.copy().rename({ 'N59661': 'tax_unit_count', 'A59661': 'eitc' }, axis = 1) - eitc_no_children['eitc_children'] = 0 - eitc_one_child = district_marginals.copy()[['ucgid_str', 'N59662', 'A59662']].rename({ + records.append( + create_records(eitc_no_children, "eitc_children", "tax_unit_count") + ) + records.append( + create_records(eitc_no_children, "eitc_children", "eitc") + ) + + eitc_one_child = all_marginals.copy().rename({ 'N59662': 'tax_unit_count', 'A59662': 'eitc' }, axis=1) eitc_one_child['eitc_children'] = 1 - - eitc_two_children = district_marginals.copy()[['ucgid_str', 'N59663', 'A59663']].rename({ + + records.append( + create_records(eitc_one_child, "eitc_children", "tax_unit_count") + ) + records.append( + create_records(eitc_one_child, "eitc_children", 
"eitc") + ) + + eitc_two_children = all_marginals.copy().rename({ 'N59663': 'tax_unit_count', 'A59663': 'eitc' }, axis=1) eitc_two_children['eitc_children'] = 2 - - eitc_three_plus_children = district_marginals.copy()[['ucgid_str', 'N59664', 'A59664']].rename({ + + records.append( + create_records(eitc_two_children, "eitc_children", "tax_unit_count") + ) + records.append( + create_records(eitc_two_children, "eitc_children", "eitc") + ) + + eitc_three_plus_children = all_marginals.copy().rename({ 'N59664': 'tax_unit_count', 'A59664': 'eitc' }, axis=1) eitc_three_plus_children['eitc_children'] = '3+' - # Question: so many: why do this processing at the district level, since the structure is the same all over? - # OR, is it? At least the renaming is. - # Keep going for now and see how much you can generalize + records.append( + create_records(eitc_three_plus_children, "eitc_children", "tax_unit_count") + ) + records.append( + create_records(eitc_three_plus_children, "eitc_children", "eitc") + ) + # QBID ---------------------------------------------------------------------- + qbid = all_marginals.copy().rename({ + 'N59664': 'tax_unit_count', + 'A59664': 'qbid' + }, axis=1) + # No breakdown variable other than the geo here + qbid['one'] = 1 + records.append( + create_records(qbid, "one", "tax_unit_count") + ) + records.append( + create_records(qbid, "one", "qbid") + ) + # SALT ----------------------------------------------------------------------- - at_large_states = ( - district_df.groupby("STATEFIPS")["CONG_DISTRICT"] - .nunique() - .pipe(lambda s: s[s == 1].index) - ) - district_df = district_df.loc[ - (district_df["CONG_DISTRICT"] != "00") | (district_df["STATEFIPS"].isin(at_large_states)) - ].reset_index(drop=True) + # TODO: THERE's definitely a pattern here + # TODO: you forgot to multiply by 1000! + # For all the files, the money amounts are reported in thousands of dollars. 
+ salt = all_marginals.copy().rename({ + 'N18425': 'tax_unit_count', + 'A18425': 'salt' + }, axis=1) + salt['one'] = 1 - district_df["GEO_NAME"] = "district_" + ( - f"{district_df['STATEFIPS'].map(FIPS_TO_STATE_ABBR)}-{district_df['CONG_DISTRICT']}" + records.append( + create_records(salt, "one", "tax_unit_count") + ) + records.append( + create_records(qbid, "one", "salt") ) - district_df["agi_bracket"] = district_df["agi_stub"].map(AGI_STUB_TO_BAND) - district_df + return records + + + - result = df[ - [ - "GEO_ID", - "GEO_NAME", - "CONG_DISTRICT", - "STATE", - "agi_bracket", - soi_variable_ident, - ] - ].rename(columns={soi_variable_ident: "VALUE"}) - result["LOWER_BOUND"] = result["agi_bracket"].map( - lambda b: AGI_BOUNDS[b][0] - ) - result["UPPER_BOUND"] = result["agi_bracket"].map( - lambda b: AGI_BOUNDS[b][1] - ) - # if redistrict: - # result = apply_redistricting(result, variable_name) - - assert df["GEO_ID"].nunique() == 436 - - if redistrict: - # After redistricting, validate against the new district codes from the mapping - mapping_df = pd.read_csv(CALIBRATION_FOLDER / "district_mapping.csv") - valid_district_codes = set(mapping_df["code_new"].unique()) - - # Check that all GEO_IDs are valid - produced_codes = set(result["GEO_ID"]) - invalid_codes = produced_codes - valid_district_codes - assert ( - not invalid_codes - ), f"Invalid district codes after redistricting: {invalid_codes}" - - # Check we have exactly 436 districts - assert ( - len(produced_codes) == 436 - ), f"Expected 436 districts after redistricting, got {len(produced_codes)}" - - # Check that all GEO_IDs successfully mapped to names - missing_names = result[result["GEO_NAME"].isna()]["GEO_ID"].unique() - assert ( - len(missing_names) == 0 - ), f"GEO_IDs without names in ID_TO_NAME mapping: {missing_names}" - - # final column order - result = result[ - ["GEO_ID", "GEO_NAME", "LOWER_BOUND", "UPPER_BOUND", "VALUE"] - ] - result["IS_COUNT"] = int(is_count) - result["VARIABLE"] = variable_name - result["VALUE"] = np.where( - result["IS_COUNT"] == 0, result["VALUE"] * 1_000, result["VALUE"] - ) - if district_df is not None: - # If a DataFrame is passed, we append the new data to it. 
- df = pd.concat([district_df, result], ignore_index=True) - return df - return result def _get_soi_data(geo_level: str) -> pd.DataFrame: From c73ef870ca30da35dd7f3fa511ec13db517af359 Mon Sep 17 00:00:00 2001 From: baogorek Date: Thu, 7 Aug 2025 13:50:48 -0400 Subject: [PATCH 06/27] new variables added --- policyengine_us_data/db/load_soi_targets.py | 249 ++++++++++++-------- 1 file changed, 157 insertions(+), 92 deletions(-) diff --git a/policyengine_us_data/db/load_soi_targets.py b/policyengine_us_data/db/load_soi_targets.py index b564bbf8..2fe3fa91 100644 --- a/policyengine_us_data/db/load_soi_targets.py +++ b/policyengine_us_data/db/load_soi_targets.py @@ -5,8 +5,7 @@ # The Data: https://www.irs.gov/pub/irs-soi/22incd.csv from pathlib import Path - -from typing import Optional, Union +from typing import List, Optional, Sequence, Dict, Tuple, Any, Union import numpy as np import pandas as pd @@ -246,6 +245,97 @@ def create_records(df, breakdown_variable, target_variable): return temp_df +def make_records( + df: pd.DataFrame, + *, + count_col: str, + amount_col: str, + amount_name: str, + breakdown_col: Optional[str] = None, + multiplier: int = 1_000, +): + df = ( + df.rename({count_col: "tax_unit_count", + amount_col: amount_name}, + axis=1) + .copy() + ) + + if breakdown_col is None: + breakdown_col = "one" + df[breakdown_col] = 1 + + rec_counts = create_records(df, breakdown_col, "tax_unit_count") + rec_amounts = create_records(df, breakdown_col, amount_name) + rec_amounts["target_value"] *= multiplier # Only the amounts get * 1000 + rec_counts["target_variable"] = f"{amount_name}_tax_unit_count" + + return rec_counts, rec_amounts + + + +_TARGET_COL_MAP = { + "N1": "agi_tax_unit_count", # number of returns (≈ “tax units”) + "N2": "agi_person_count", # number of individuals + "A00100": "agi_total_amount", # total Adjusted Gross Income +} + +_BREAKDOWN_FIELD = "agi_stub" # numeric AGI stub 1‑10 from IRS +_BREAKDOWN_NAME = "agi_stub" # what will go in `breakdown_variable` + +def make_agi_long(df: pd.DataFrame) -> pd.DataFrame: + """ + Convert IRS SOI AGI‑split table from wide to the long format used + in your `records[*]` list. + + Parameters + ---------- + df : DataFrame + Must contain `ucgid_str`, `agi_stub` and the three IRS fields + in `_TARGET_COL_MAP` (N1, N2, A00100). + + Returns + ------- + DataFrame with columns: + ucgid_str + breakdown_variable (always "agi_stub") + breakdown_value (1‑10) + target_variable ("agi_tax_unit_count" | "agi_person_count" | "agi_total_amount") + target_value (float) + """ + # — keep only what we need and rename for clarity + work = ( + df[["ucgid_str", _BREAKDOWN_FIELD] + list(_TARGET_COL_MAP)] + .rename(columns=_TARGET_COL_MAP) # N1 → agi_tax_unit_count, etc. + ) + + # — wide → long + long = ( + work.melt( + id_vars=["ucgid_str", _BREAKDOWN_FIELD], + var_name="target_variable", + value_name="target_value" + ) + .rename(columns={_BREAKDOWN_FIELD: "breakdown_value"}) + .assign(breakdown_variable=_BREAKDOWN_NAME) + # Optional: add a human‑readable band label if useful + # .assign(breakdown_label=lambda d: d["breakdown_value"].map(AGI_STUB_TO_BAND)) + ) + + # — final column order + long = long[["ucgid_str", + "breakdown_variable", + "breakdown_value", + "target_variable", + "target_value"]] + + # consistently sort (purely cosmetic) + return ( + long.sort_values(["ucgid_str", "breakdown_value", "target_variable"]) + .reset_index(drop=True) + ) + + def extract_soi_data() -> pd.DataFrame: """Download and save congressional district AGI totals. 
@@ -258,6 +348,32 @@ def extract_soi_data() -> pd.DataFrame:
 
 raw_df = extract_soi_data()
 # a "stub" is a term the IRS uses for a predefined category or group, specifically an income bracket.
 
+TARGETS = [
+    dict(code="59661", name="eitc", breakdown=("eitc_children", 0)),
+    dict(code="59662", name="eitc", breakdown=("eitc_children", 1)),
+    dict(code="59663", name="eitc", breakdown=("eitc_children", 2)),
+    dict(code="59664", name="eitc", breakdown=("eitc_children", "3+")),
+    dict(code="59664", name="qbid", breakdown=None),
+    dict(code="18500", name="real_estate_taxes", breakdown=None),
+    dict(code="01000", name="net_capital_gain", breakdown=None),
+    dict(code="03150", name="ira_payments", breakdown=None),
+    dict(code="00300", name="taxable_interest", breakdown=None),
+    dict(code="00400", name="tax_exempt_interest", breakdown=None),
+    dict(code="00600", name="ordinary_dividends", breakdown=None),
+    dict(code="00650", name="qualified_dividends", breakdown=None),
+    dict(code="26270", name="partnership_and_s_corp_net_income", breakdown=None),
+    dict(code="02500", name="total_social_security", breakdown=None),
+    dict(code="01700", name="pension_and_annuities", breakdown=None),
+    dict(code="02300", name="unemployment_compensation", breakdown=None),
+    dict(code="00900", name="business_net_income", breakdown=None),
+    dict(code="17000", name="medical_and_dental_deduction", breakdown=None),
+    dict(code="00700", name="salt_refunds", breakdown=None),
+    dict(code="18425", name="salt_amount", breakdown=None),
+    dict(code="06500", name="income_tax", breakdown=None),
+]
+
+
+
 def transform_soi_data(raw_df)
 
@@ -317,104 +433,53 @@ def transform_soi_data(raw_df)
     # Collect targets from the SOI file
     records = []
-
-    # EITC ---------------------------------------------------------------------------
-    eitc_no_children = all_marginals.copy().rename({
-        'N59661': 'tax_unit_count',
-        'A59661': 'eitc'
-    }, axis = 1)
-    eitc_no_children['eitc_children'] = 0
-
-    records.append(
-        create_records(eitc_no_children, "eitc_children", "tax_unit_count")
-    )
-    records.append(
-        create_records(eitc_no_children, "eitc_children", "eitc")
-    )
-
-    eitc_one_child = all_marginals.copy().rename({
-        'N59662': 'tax_unit_count',
-        'A59662': 'eitc'
-    }, axis=1)
-    eitc_one_child['eitc_children'] = 1
-
-    records.append(
-        create_records(eitc_one_child, "eitc_children", "tax_unit_count")
-    )
-    records.append(
-        create_records(eitc_one_child, "eitc_children", "eitc")
-    )
-
-    eitc_two_children = all_marginals.copy().rename({
-        'N59663': 'tax_unit_count',
-        'A59663': 'eitc'
-    }, axis=1)
-    eitc_two_children['eitc_children'] = 2
-
-    records.append(
-        create_records(eitc_two_children, "eitc_children", "tax_unit_count")
-    )
-    records.append(
-        create_records(eitc_two_children, "eitc_children", "eitc")
-    )
-
-    eitc_three_plus_children = all_marginals.copy().rename({
-        'N59664': 'tax_unit_count',
-        'A59664': 'eitc'
-    }, axis=1)
-    eitc_three_plus_children['eitc_children'] = '3+'
-
-    records.append(
-        create_records(eitc_three_plus_children, "eitc_children", "tax_unit_count")
-    )
-    records.append(
-        create_records(eitc_three_plus_children, "eitc_children", "eitc")
-    )
-
-    # QBID ----------------------------------------------------------------------
-    qbid = all_marginals.copy().rename({
-        'N59664': 'tax_unit_count',
-        'A59664': 'qbid'
-    }, axis=1)
-    # No breakdown variable other than the geo here
-    qbid['one'] = 1
-
-    records.append(
-        create_records(qbid, "one", "tax_unit_count")
-    )
-    records.append(
-        create_records(qbid, "one", "qbid")
-    )
-
-    # SALT 
----------------------------------------------------------------------- - - # TODO: THERE's definitely a pattern here - # TODO: you forgot to multiply by 1000! - # For all the files, the money amounts are reported in thousands of dollars. - salt = all_marginals.copy().rename({ - 'N18425': 'tax_unit_count', - 'A18425': 'salt' - }, axis=1) - salt['one'] = 1 - - records.append( - create_records(salt, "one", "tax_unit_count") - ) - records.append( - create_records(qbid, "one", "salt") - ) - - - return records - + for spec in TARGETS: + count_col = f"N{spec['code']}" # e.g. 'N59661' + amount_col = f"A{spec['code']}" # e.g. 'A59661' + + df = all_marginals.copy() + + if spec["breakdown"] is not None: + col, val = spec["breakdown"] + df[col] = val + breakdown_col = col + else: + breakdown_col = None + + rec_counts, rec_amounts = make_records( + df, + count_col = count_col, + amount_col = amount_col, + amount_name = spec["name"], + breakdown_col = breakdown_col, + multiplier = 1_000, + ) + records.extend([rec_counts, rec_amounts]) + # Custom AGI amount, which doesn't have a count column (it has N1 and N2) + temp_df = df[["ucgid_str"]].copy() + temp_df["breakdown_variable"] = "one" + temp_df["breakdown_value"] = 1 + temp_df["target_variable"] = "agi" + temp_df["target_value"] = df["A00100"] * 1_000 + records.append(temp_df) + # It's notable that the national counts only have agi_stub = 0 + all_agi_splits = all_df.copy().loc[all_df.agi_stub != 0] + assert all_agi_splits.shape[0] % (436 + 51 + 0) == 0 + # Still a bit of work to do at the time of loading, since the breakdown variable + # is agi_stub + agi_long = make_agi_long(all_agi_splits) + # We have the distribution and the total amount, let's not go crazy here + agi_long = agi_long.loc[agi_long.target_variable != "agi_total_amount"] + records.append(agi_long) + return pd.concat(records) def _get_soi_data(geo_level: str) -> pd.DataFrame: From 57d98501e9502ac94e41cb9c6aebc31d730360b4 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 8 Aug 2025 17:00:03 -0400 Subject: [PATCH 07/27] medicaid etl file --- policyengine_us_data/db/etl_medicaid.py | 206 ++++++++++++++++++++++++ 1 file changed, 206 insertions(+) create mode 100644 policyengine_us_data/db/etl_medicaid.py diff --git a/policyengine_us_data/db/etl_medicaid.py b/policyengine_us_data/db/etl_medicaid.py new file mode 100644 index 00000000..71786fa0 --- /dev/null +++ b/policyengine_us_data/db/etl_medicaid.py @@ -0,0 +1,206 @@ +import requests +import pandas as pd + + + +# This is from another file +#def extract_docs(year=2023): +# docs_url = ( +# f"https://api.census.gov/data/{year}/acs/acs1/subject/variables.json" +# ) +# +# try: +# docs_response = requests.get(docs_url) +# docs_response.raise_for_status() +# +# docs = docs_response.json() +# docs["year"] = year +# +# except requests.exceptions.RequestException as e: +# print(f"Error during API request: {e}") +# raise +# except Exception as e: +# print(f"An error occurred: {e}") +# raise +# return docs + + + +# State abbreviation to FIPS code mapping +state_fips_map = { + 'AL': '01', 'AK': '02', 'AZ': '04', 'AR': '05', 'CA': '06', + 'CO': '08', 'CT': '09', 'DE': '10', 'FL': '12', 'GA': '13', + 'HI': '15', 'ID': '16', 'IL': '17', 'IN': '18', 'IA': '19', + 'KS': '20', 'KY': '21', 'LA': '22', 'ME': '23', 'MD': '24', + 'MA': '25', 'MI': '26', 'MN': '27', 'MS': '28', 'MO': '29', + 'MT': '30', 'NE': '31', 'NV': '32', 'NH': '33', 'NJ': '34', + 'NM': '35', 'NY': '36', 'NC': '37', 'ND': '38', 'OH': '39', + 'OK': '40', 'OR': '41', 'PA': '42', 'RI': '44', 
'SC': '45', + 'SD': '46', 'TN': '47', 'TX': '48', 'UT': '49', 'VT': '50', + 'VA': '51', 'WA': '53', 'WV': '54', 'WI': '55', 'WY': '56', + 'DC': '11' +} + + + +# I can get data from: + + "S2704_C02_006E": { + "label": "Estimate!!Public Coverage!!COVERAGE ALONE OR IN COMBINATION!!Medicaid/means-tested public coverage alone or in combination", + "concept": "Public Health Insurance Coverage by Type and Selected Characteristics", + "predicateType": "int", + "group": "S2704", + "limit": 0, + "attributes": "S2704_C02_006EA,S2704_C02_006M,S2704_C02_006MA" + }, + + +def extract_medicaid_data(): + year = 2023 + base_url = ( + f"https://api.census.gov/data/{year}/acs/acs1/subject?get=group(S2704)" + ) + url = f"{base_url}&for=congressional+district:*" + response = requests.get(url) + response.raise_for_status() + + data = response.json() + + headers = data[0] + data_rows = data[1:] + cd_survey_df = pd.DataFrame(data_rows, columns=headers) + + item = "6165f45b-ca93-5bb5-9d06-db29c692a360" + response = requests.get( + f"https://data.medicaid.gov/api/1/metastore/schemas/dataset/items/{item}?show-reference-ids=false" + ) + metadata = response.json() + + data_url = metadata['distribution'][0]['data']['downloadURL'] + state_admin_df = pd.read_csv(data_url) + + return cd_survey_df, state_admin_df + + +cd_survey_df, state_admin_df = extract_medicaid_data() + +def transform_medicaid_data(state_admin_df, cd_survey_df): + state_df = state_admin_df.loc[ + (state_admin_df["Reporting Period"] == 202312) & + (state_admin_df["Final Report"] == "Y"), + ["State Abbreviation", "Reporting Period", "Total Medicaid Enrollment"] + ] + + state_df["FIPS"] = state_df["State Abbreviation"].map(state_fips_map) + + cd_df = cd_survey_df[["GEO_ID", "state", "congressional district", "S2704_C02_006E"]] + + nc_cd_sum = cd_df.loc[cd_df.state == "37"].S2704_C02_006E.astype(int).sum() + nc_state_sum = state_df.loc[state_df.FIPS == '37']['Total Medicaid Enrollment'].values[0] + assert nc_cd_sum > .5 * nc_state_sum + assert nc_cd_sum <= nc_state_sum + + return long_df + +# YOU KNOW WHAT TO DO! + +def load_medicaid_data(): + + pass + + + + + + + + + + + + + + + + + +def _geo_clause_for(geo: str) -> str: + if geo == "National": + return "for=us:*" + if geo == "State": + return "for=state:*" + if geo == "District": + # Congressional districts; adding 'in=state:*' avoids API ambiguities + return "for=congressional+district:*&in=state:*" + raise ValueError("geo must be 'National', 'State', or 'District'") + + +def _group_meta(year: int, dataset: str, group: str) -> dict: + url = f"https://api.census.gov/data/{year}/{dataset}/groups/{group}.json" + r = requests.get(url, timeout=60) + r.raise_for_status() + return r.json() + + +def extract_medicaid_s2701(geo: str, year: int = 2023, + which: str = "estimate", + by_age: bool = True) -> pd.DataFrame: + """ + Pulls ACS S2701 'With Medicaid/means-tested public coverage' for the requested geography. + which: 'estimate' (counts) or 'percent' + by_age: True -> Under 19, 19-64, 65+ ; False -> all ages combined + Returns: tidy DataFrame with readable columns plus geo identifiers. 
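+    Example call: extract_medicaid_s2701("State", 2023, which="estimate", by_age=False)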
+ """ + dataset = "acs/acs1/subject" + group = "S2701" + meta = _group_meta(year, dataset, group)["variables"] + + target_prefix = "Estimate" if which == "estimate" else "Percent" + selected, rename = [], {} + + for vid, v in meta.items(): + pass + + if not vid.endswith("E"): # just the estimates + continue + label = v["label"] + if not label.startswith(target_prefix): + continue + ## Keep 'With Medicaid/means-tested public coverage' + #if "COVERAGE TYPE!!With Medicaid/means-tested public coverage" not in label: + # continue + + has_age = "!!AGE!!" in label + if by_age and not has_age: + continue + if not by_age and has_age: + continue + + selected.append(vid) + nice = label.split("!!")[-1] if by_age else "All ages" + rename[vid] = f"Medicaid ({nice}) - {which}" + + if not selected: + raise RuntimeError("No S2701 Medicaid variables matched. Check 'which' or 'by_age' options.") + + get_vars = ["NAME"] + selected + url = f"https://api.census.gov/data/{year}/{dataset}?get={','.join(get_vars)}&{_geo_clause_for(geo)}" + r = requests.get(url, timeout=120) + r.raise_for_status() + raw = r.json() + + df = pd.DataFrame(raw[1:], columns=raw[0]) + for vid in selected: + df[vid] = pd.to_numeric(df[vid], errors="coerce") + df = df.rename(columns=rename) + + # Reorder: geo columns first, then NAME, then our measures + geo_cols = [c for c in ["us", "state", "congressional district"] if c in df.columns] + measure_cols = [rename[v] for v in selected] + return df[geo_cols + ["NAME"] + measure_cols] + + +df = extract_medicaid_s2701("District", + 2023, + "estimate", + False) From 9c4838e5a99fbc43ac89d7ff6d2a1f3986b4ef45 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Sun, 10 Aug 2025 22:55:06 -0400 Subject: [PATCH 08/27] medicaid is loading in --- policyengine_us_data/db/etl_medicaid.py | 242 +++++++++++------------- 1 file changed, 114 insertions(+), 128 deletions(-) diff --git a/policyengine_us_data/db/etl_medicaid.py b/policyengine_us_data/db/etl_medicaid.py index 71786fa0..395bc109 100644 --- a/policyengine_us_data/db/etl_medicaid.py +++ b/policyengine_us_data/db/etl_medicaid.py @@ -2,28 +2,14 @@ import pandas as pd +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker -# This is from another file -#def extract_docs(year=2023): -# docs_url = ( -# f"https://api.census.gov/data/{year}/acs/acs1/subject/variables.json" -# ) -# -# try: -# docs_response = requests.get(docs_url) -# docs_response.raise_for_status() -# -# docs = docs_response.json() -# docs["year"] = year -# -# except requests.exceptions.RequestException as e: -# print(f"Error during API request: {e}") -# raise -# except Exception as e: -# print(f"An error occurred: {e}") -# raise -# return docs - +from policyengine_us_data.db.create_database_tables import ( + Stratum, + StratumConstraint, + Target, +) # State abbreviation to FIPS code mapping @@ -42,17 +28,14 @@ } - -# I can get data from: - - "S2704_C02_006E": { - "label": "Estimate!!Public Coverage!!COVERAGE ALONE OR IN COMBINATION!!Medicaid/means-tested public coverage alone or in combination", - "concept": "Public Health Insurance Coverage by Type and Selected Characteristics", - "predicateType": "int", - "group": "S2704", - "limit": 0, - "attributes": "S2704_C02_006EA,S2704_C02_006M,S2704_C02_006MA" - }, +#"S2704_C02_006E": { +# "label": "Estimate!!Public Coverage!!COVERAGE ALONE OR IN COMBINATION!!Medicaid/means-tested public coverage alone or in combination", +# "concept": "Public Health Insurance Coverage by Type and Selected Characteristics", +# 
"predicateType": "int", +# "group": "S2704", +# "limit": 0, +# "attributes": "S2704_C02_006EA,S2704_C02_006M,S2704_C02_006MA" +#}, def extract_medicaid_data(): @@ -82,8 +65,6 @@ def extract_medicaid_data(): return cd_survey_df, state_admin_df -cd_survey_df, state_admin_df = extract_medicaid_data() - def transform_medicaid_data(state_admin_df, cd_survey_df): state_df = state_admin_df.loc[ (state_admin_df["Reporting Period"] == 202312) & @@ -100,107 +81,112 @@ def transform_medicaid_data(state_admin_df, cd_survey_df): assert nc_cd_sum > .5 * nc_state_sum assert nc_cd_sum <= nc_state_sum - return long_df - -# YOU KNOW WHAT TO DO! - -def load_medicaid_data(): - - pass - - - - - - - - - + state_df = state_df.rename(columns={'Total Medicaid Enrollment': 'medicaid_enrollment'}) + state_df['ucgid_str'] = '0400000US' + state_df['FIPS'].astype(str) + cd_df = cd_df.rename(columns={'S2704_C02_006E': 'medicaid_enrollment', 'GEO_ID': 'ucgid_str'}) + cd_df = cd_df.loc[cd_df.state != '72'] + out_cols = ['ucgid_str', 'medicaid_enrollment'] + return state_df[out_cols], cd_df[out_cols] +def load_medicaid_data(long_state, long_cd): + DATABASE_URL = "sqlite:///policy_data.db" + engine = create_engine(DATABASE_URL) + Session = sessionmaker(bind=engine) + session = Session() + stratum_lookup = {} -def _geo_clause_for(geo: str) -> str: - if geo == "National": - return "for=us:*" - if geo == "State": - return "for=state:*" - if geo == "District": - # Congressional districts; adding 'in=state:*' avoids API ambiguities - return "for=congressional+district:*&in=state:*" - raise ValueError("geo must be 'National', 'State', or 'District'") - - -def _group_meta(year: int, dataset: str, group: str) -> dict: - url = f"https://api.census.gov/data/{year}/{dataset}/groups/{group}.json" - r = requests.get(url, timeout=60) - r.raise_for_status() - return r.json() - - -def extract_medicaid_s2701(geo: str, year: int = 2023, - which: str = "estimate", - by_age: bool = True) -> pd.DataFrame: - """ - Pulls ACS S2701 'With Medicaid/means-tested public coverage' for the requested geography. - which: 'estimate' (counts) or 'percent' - by_age: True -> Under 19, 19-64, 65+ ; False -> all ages combined - Returns: tidy DataFrame with readable columns plus geo identifiers. - """ - dataset = "acs/acs1/subject" - group = "S2701" - meta = _group_meta(year, dataset, group)["variables"] - - target_prefix = "Estimate" if which == "estimate" else "Percent" - selected, rename = [], {} - - for vid, v in meta.items(): - pass - - if not vid.endswith("E"): # just the estimates - continue - label = v["label"] - if not label.startswith(target_prefix): - continue - ## Keep 'With Medicaid/means-tested public coverage' - #if "COVERAGE TYPE!!With Medicaid/means-tested public coverage" not in label: - # continue - - has_age = "!!AGE!!" in label - if by_age and not has_age: - continue - if not by_age and has_age: - continue - - selected.append(vid) - nice = label.split("!!")[-1] if by_age else "All ages" - rename[vid] = f"Medicaid ({nice}) - {which}" - - if not selected: - raise RuntimeError("No S2701 Medicaid variables matched. 
Check 'which' or 'by_age' options.") - - get_vars = ["NAME"] + selected - url = f"https://api.census.gov/data/{year}/{dataset}?get={','.join(get_vars)}&{_geo_clause_for(geo)}" - r = requests.get(url, timeout=120) - r.raise_for_status() - raw = r.json() - - df = pd.DataFrame(raw[1:], columns=raw[0]) - for vid in selected: - df[vid] = pd.to_numeric(df[vid], errors="coerce") - df = df.rename(columns=rename) - - # Reorder: geo columns first, then NAME, then our measures - geo_cols = [c for c in ["us", "state", "congressional district"] if c in df.columns] - measure_cols = [rename[v] for v in selected] - return df[geo_cols + ["NAME"] + measure_cols] + # Wow, the first time we're making these geos with no breakdown variable + # National ---------------- + nat_stratum = Stratum( + parent_stratum_id=None, stratum_group_id=0, notes="Geo: 0100000US" + ) + nat_stratum.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="in", + value="0100000US", + ) + ] -df = extract_medicaid_s2701("District", - 2023, - "estimate", - False) + session.add(nat_stratum) + session.flush() + stratum_lookup["National"] = nat_stratum.stratum_id + + # State ------------------- + stratum_lookup["State"] = {} + for _, row in long_state.iterrows(): + + note = f"Geo: {row['ucgid_str']}" + parent_stratum_id = nat_stratum.stratum_id + + new_stratum = Stratum( + parent_stratum_id=parent_stratum_id, stratum_group_id=0, notes=note + ) + new_stratum.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="in", + value=row["ucgid_str"], + ), + ] + new_stratum.targets_rel.append( + Target( + variable="medicaid_enrollment", + period=2023, + value=row["medicaid_enrollment"], + source_id=2, + active=True, + ) + ) + session.add(new_stratum) + session.flush() + stratum_lookup["State"][row['ucgid_str']] = new_stratum.stratum_id + + + # District ------------------- + stratum_lookup["District"] = {} + for _, row in long_cd.iterrows(): + + note = f"Geo: {row['ucgid_str']}" + parent_stratum_id = stratum_lookup["State"][f'0400000US{row["ucgid_str"][-4:-2]}'] + + new_stratum = Stratum( + parent_stratum_id=parent_stratum_id, stratum_group_id=0, notes=note + ) + new_stratum.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="in", + value=row["ucgid_str"], + ), + ] + new_stratum.targets_rel.append( + Target( + variable="medicaid_enrollment", + period=2023, + value=row["medicaid_enrollment"], + source_id=2, + active=True, + ) + ) + session.add(new_stratum) + session.flush() + + + session.commit() + + return stratum_lookup + +if __name__ == "__main__": + cd_survey_df, state_admin_df = extract_medicaid_data() + + long_state, long_cd = transform_medicaid_data(state_admin_df, cd_survey_df) + + load_medicaid_data(long_state, long_cd) From 57716f2088b470c06475273b757b2890690b65a9 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Mon, 11 Aug 2025 23:18:08 -0400 Subject: [PATCH 09/27] medicaid and some SNAP data --- policyengine_us_data/db/etl_medicaid.py | 79 +++-- policyengine_us_data/db/etl_snap.py | 438 ++++++++++++++++++++++++ policyengine_us_data/utils/census.py | 42 +++ 3 files changed, 522 insertions(+), 37 deletions(-) create mode 100644 policyengine_us_data/db/etl_snap.py create mode 100644 policyengine_us_data/utils/census.py diff --git a/policyengine_us_data/db/etl_medicaid.py b/policyengine_us_data/db/etl_medicaid.py index 395bc109..d1babe31 100644 --- a/policyengine_us_data/db/etl_medicaid.py +++ 
b/policyengine_us_data/db/etl_medicaid.py @@ -1,7 +1,6 @@ import requests -import pandas as pd - +import pandas as pd from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker @@ -28,18 +27,7 @@ } -#"S2704_C02_006E": { -# "label": "Estimate!!Public Coverage!!COVERAGE ALONE OR IN COMBINATION!!Medicaid/means-tested public coverage alone or in combination", -# "concept": "Public Health Insurance Coverage by Type and Selected Characteristics", -# "predicateType": "int", -# "group": "S2704", -# "limit": 0, -# "attributes": "S2704_C02_006EA,S2704_C02_006M,S2704_C02_006MA" -#}, - - -def extract_medicaid_data(): - year = 2023 +def extract_medicaid_data(year): base_url = ( f"https://api.census.gov/data/{year}/acs/acs1/subject?get=group(S2704)" ) @@ -65,9 +53,12 @@ def extract_medicaid_data(): return cd_survey_df, state_admin_df -def transform_medicaid_data(state_admin_df, cd_survey_df): +def transform_medicaid_data(state_admin_df, cd_survey_df, year): + + reporting_period = year * 100 + 12 + print(f"Reporting period is {reporting_period}") state_df = state_admin_df.loc[ - (state_admin_df["Reporting Period"] == 202312) & + (state_admin_df["Reporting Period"] == reporting_period) & (state_admin_df["Final Report"] == "Y"), ["State Abbreviation", "Reporting Period", "Total Medicaid Enrollment"] ] @@ -91,29 +82,34 @@ def transform_medicaid_data(state_admin_df, cd_survey_df): return state_df[out_cols], cd_df[out_cols] -def load_medicaid_data(long_state, long_cd): +def load_medicaid_data(long_state, long_cd, year): DATABASE_URL = "sqlite:///policy_data.db" engine = create_engine(DATABASE_URL) + year = 2023 Session = sessionmaker(bind=engine) session = Session() stratum_lookup = {} - # Wow, the first time we're making these geos with no breakdown variable - # National ---------------- nat_stratum = Stratum( - parent_stratum_id=None, stratum_group_id=0, notes="Geo: 0100000US" + parent_stratum_id=None, stratum_group_id=0, notes="Geo: 0100000US Medicaid Enrolled" ) nat_stratum.constraints_rel = [ StratumConstraint( constraint_variable="ucgid_str", - operation="in", + operation="equals", value="0100000US", - ) + ), + StratumConstraint( + constraint_variable="medicaid_enrolled", + operation="equals", + value="True", + ), ] + # No target at the national level is provided at this time. 
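+    # (If a national target is wanted later, one option would be to sum the
+    # state administrative totals, long_state["medicaid_enrollment"].sum(),
+    # and attach that as a Target on nat_stratum.)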
session.add(nat_stratum) session.flush() @@ -123,7 +119,7 @@ def load_medicaid_data(long_state, long_cd): stratum_lookup["State"] = {} for _, row in long_state.iterrows(): - note = f"Geo: {row['ucgid_str']}" + note = f"Geo: {row['ucgid_str']} Medicaid Enrolled" parent_stratum_id = nat_stratum.stratum_id new_stratum = Stratum( @@ -132,14 +128,19 @@ def load_medicaid_data(long_state, long_cd): new_stratum.constraints_rel = [ StratumConstraint( constraint_variable="ucgid_str", - operation="in", + operation="equals", value=row["ucgid_str"], ), + StratumConstraint( + constraint_variable="medicaid_enrolled", + operation="equals", + value="True", + ), ] new_stratum.targets_rel.append( Target( - variable="medicaid_enrollment", - period=2023, + variable="person_count", + period=year, value=row["medicaid_enrollment"], source_id=2, active=True, @@ -149,12 +150,10 @@ def load_medicaid_data(long_state, long_cd): session.flush() stratum_lookup["State"][row['ucgid_str']] = new_stratum.stratum_id - # District ------------------- - stratum_lookup["District"] = {} for _, row in long_cd.iterrows(): - note = f"Geo: {row['ucgid_str']}" + note = f"Geo: {row['ucgid_str']} Medicaid Enrolled" parent_stratum_id = stratum_lookup["State"][f'0400000US{row["ucgid_str"][-4:-2]}'] new_stratum = Stratum( @@ -163,14 +162,19 @@ def load_medicaid_data(long_state, long_cd): new_stratum.constraints_rel = [ StratumConstraint( constraint_variable="ucgid_str", - operation="in", + operation="equals", value=row["ucgid_str"], ), + StratumConstraint( + constraint_variable="medicaid_enrolled", + operation="equals", + value="True", + ), ] new_stratum.targets_rel.append( Target( - variable="medicaid_enrollment", - period=2023, + variable="person_count", + period=year, value=row["medicaid_enrollment"], source_id=2, active=True, @@ -179,14 +183,15 @@ def load_medicaid_data(long_state, long_cd): session.add(new_stratum) session.flush() - session.commit() - return stratum_lookup if __name__ == "__main__": - cd_survey_df, state_admin_df = extract_medicaid_data() - long_state, long_cd = transform_medicaid_data(state_admin_df, cd_survey_df) + year = 2023 + + cd_survey_df, state_admin_df = extract_medicaid_data(year) + + long_state, long_cd = transform_medicaid_data(state_admin_df, cd_survey_df, year) - load_medicaid_data(long_state, long_cd) + load_medicaid_data(long_state, long_cd, year) diff --git a/policyengine_us_data/db/etl_snap.py b/policyengine_us_data/db/etl_snap.py new file mode 100644 index 00000000..a82da744 --- /dev/null +++ b/policyengine_us_data/db/etl_snap.py @@ -0,0 +1,438 @@ +import requests +import zipfile +import io +import os +import re +from pathlib import Path + +import pandas as pd +import numpy as np +import us +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker + +from policyengine_us_data.db.create_database_tables import ( + Stratum, + StratumConstraint, + Target, +) +from policyengine_us_data.utils.census import ( + get_census_docs, + pull_subject_table, +) + + +STATE_NAME_TO_FIPS = { + "Alabama": "01", + "Alaska": "02", + "Arizona": "04", + "Arkansas": "05", + "California": "06", + "Colorado": "08", + "Connecticut": "09", + "Delaware": "10", + "District of Columbia": "11", + "Florida": "12", + "Georgia": "13", + "Hawaii": "15", + "Idaho": "16", + "Illinois": "17", + "Indiana": "18", + "Iowa": "19", + "Kansas": "20", + "Kentucky": "21", + "Louisiana": "22", + "Maine": "23", + "Maryland": "24", + "Massachusetts": "25", + "Michigan": "26", + "Minnesota": "27", + "Mississippi": "28", + 
"Missouri": "29", + "Montana": "30", + "Nebraska": "31", + "Nevada": "32", + "New Hampshire": "33", + "New Jersey": "34", + "New Mexico": "35", + "New York": "36", + "North Carolina": "37", + "North Dakota": "38", + "Ohio": "39", + "Oklahoma": "40", + "Oregon": "41", + "Pennsylvania": "42", + "Rhode Island": "44", + "South Carolina": "45", + "South Dakota": "46", + "Tennessee": "47", + "Texas": "48", + "Utah": "49", + "Vermont": "50", + "Virginia": "51", + "Washington": "53", + "West Virginia": "54", + "Wisconsin": "55", + "Wyoming": "56", +} + + +def extract_administrative_snap_data(year=2023): + """ + Downloads and extracts annual state-level SNAP data from the USDA FNS zip file. + """ + url = "https://www.fns.usda.gov/sites/default/files/resource-files/snap-zip-fy69tocurrent-6.zip" + + # Note: extra complexity in request due to regional restrictions on downloads (e.g., Spain) + headers = { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.5", + "Accept-Encoding": "gzip, deflate", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", + } + + try: + session = requests.Session() + session.headers.update(headers) + + # Try to visit the main page first to get any necessary cookies + main_page = "https://www.fns.usda.gov/pd/supplemental-nutrition-assistance-program-snap" + try: + session.get(main_page, timeout=30) + except: + pass # Ignore errors on the main page + + response = session.get(url, timeout=30, allow_redirects=True) + response.raise_for_status() + except requests.exceptions.RequestException as e: + print(f"Error downloading file: {e}") + # Try alternative URL or method + try: + alt_url = "https://www.fns.usda.gov/sites/default/files/resource-files/snap-zip-fy69tocurrent-6.zip" + response = session.get(alt_url, timeout=30, allow_redirects=True) + response.raise_for_status() + except requests.exceptions.RequestException as e2: + print(f"Alternative URL also failed: {e2}") + return None + + return zipfile.ZipFile(io.BytesIO(response.content)) + + +def transform_snap_administrative_data(zip_file, year): + filename = f"FY{str(year)[-2:]}.xlsx" + with zip_file.open(filename) as f: + xls = pd.ExcelFile(f) + tab_results = [] + for sheet_name in [ + "NERO", + "MARO", + "SERO", + "MWRO", + "SWRO", + "MPRO", + "WRO", + ]: + df_raw = pd.read_excel( + xls, sheet_name=sheet_name, header=None, dtype={0: str} + ) + + state_row_mask = ( + df_raw[0].notna() + & df_raw[1].isna() + & ~df_raw[0].str.contains("Total", na=False) + & ~df_raw[0].str.contains("Footnote", na=False) + ) + + df_raw["State"] = df_raw.loc[state_row_mask, 0] + df_raw["State"] = df_raw["State"].ffill() + total_rows = df_raw[df_raw[0].eq("Total")].copy() + total_rows = total_rows.rename( + columns={ + 1: "Households", + 2: "Persons", + 3: "Cost", + } + ) + + state_totals = total_rows[ + [ + "State", + "Households", + "Persons", + "Cost", # Annual (Note: the CostPer* vars are monthly) + ] + ] + + tab_results.append(state_totals) + + results_df = pd.concat(tab_results) + + df_states = results_df.loc[ + results_df["State"].isin(STATE_NAME_TO_FIPS.keys()) + ].copy() + df_states["STATE_FIPS"] = df_states["State"].map(STATE_NAME_TO_FIPS) + df_states = ( + df_states.loc[~df_states["STATE_FIPS"].isna()] + .sort_values("STATE_FIPS") + .reset_index(drop=True) + ) + df_states["ucgid_str"] = "0400000US" + 
df_states["STATE_FIPS"] + + # I don't think I need to make this long, because it's going to be 3 different variables + #df_states[['ucgid_str', 'Households']] + #df_states[['ucgid_str', 'Persons']] + #df_states[['ucgid_str', 'Cost']] + + return df_states + + +def load_snap_administrative_data(?, year): + + year = 2023 + + DATABASE_URL = "sqlite:///policy_data.db" + engine = create_engine(DATABASE_URL) + + Session = sessionmaker(bind=engine) + session = Session() + + stratum_lookup = {} + + # National ---------------- + nat_stratum = Stratum( + parent_stratum_id=None, stratum_group_id=0, notes="Geo: 0100000US Received SNAP Benefits" + ) + nat_stratum.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="equals", + value="0100000US", + ), + StratumConstraint( + constraint_variable="snap", + operation="is_greater_than", + value="0", + ), + ] + # No target at the national level is provided at this time. + + session.add(nat_stratum) + session.flush() + stratum_lookup["National"] = nat_stratum.stratum_id + + # State ------------------- + stratum_lookup["State"] = {} + for _, row in df_states.iterrows(): + + note = f"Geo: {row['ucgid_str']} Received SNAP Benefits" + parent_stratum_id = nat_stratum.stratum_id + + new_stratum = Stratum( + parent_stratum_id=parent_stratum_id, stratum_group_id=0, notes=note + ) + new_stratum.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="equals", + value=row["ucgid_str"], + ), + StratumConstraint( + constraint_variable="snap", + operation="is_greater_than", + value="0", + ), + ] + # Two targets now. Same data source. Same stratum + new_stratum.targets_rel.append( + Target( + variable="household_count", + period=year, + value=row["Households"], + source_id=3, + active=True, + ) + ) + new_stratum.targets_rel.append( + Target( + variable="snap", + period=year, + value=row["Cost"], + source_id=3, + active=True, + ) + ) + session.add(new_stratum) + session.flush() + stratum_lookup["State"][row['ucgid_str']] = new_stratum.stratum_id + + session.commit() + + + +# Moving away from administrative data to get the survey data ------ + + + +def extract_survey_snap_data(year): + + # Household count data ----- + data = pull_acs_table("S2201", "National", 2023) + data["S2201_C03_001E"] + + + # Ha, this is off my a factor of 1000, and ACS does not report dollars in 1000s + # TODO: try to figure it out. 
+ data = pull_acs_table("B19058", "State", 2023) + np.sum(data["B19058_001E"].values.astype(int)) / 1E9 + + + + raw_dfs = {} + for geo in ["District", "State", "National"]: + df = pull_subject_table(group, geo, year) + df_data = df.rename(columns=rename_mapping)[ + ["GEO_ID", "NAME"] + list(label_to_short_name_mapping.values()) + ] + if geo == "State": + raw_dfs["DC"] = df_data[df_data["GEO_ID"].isin(["0400000US11"])] + + # Filter out Puerto Rico + df_geos = df_data[ + ~df_data["GEO_ID"].isin( + [ + "5001800US7298", + "0400000US72", + ] + ) + ].copy() + raw_dfs[geo] = df_geos + SAVE_DIR = Path(get_data_directory() / "input" / "demographics") + df_geos.to_csv(SAVE_DIR / f"raw_snap_{geo}.csv", index=False) + + folder_path = ( + f"{get_data_directory()}/targets/edition=raw/" + f"base_period={year}/reference_period={year}/" + f"variable=snap_households/" + ) + raw_out = pd.concat([ + raw_dfs['National'][['GEO_ID', 'overall']], + raw_dfs['State'][['GEO_ID', 'overall']], + raw_dfs['DC'][['GEO_ID', 'overall']], + raw_dfs['District'][['GEO_ID', 'overall']] + ]).rename({"GEO_ID": "geography_id", "overall": "value"}, axis=1) + + raw_out.to_csv(os.path.join(folder_path, "part-001.csv"), index=False) + + additive_dfs = enforce_geographic_self_consistency(raw_dfs, 'overall') + usda_snap_df = extract_usda_snap_data() + adjusted_dfs = adjust_to_administrative_data(additive_dfs, 'overall', usda_snap_df) + assert check_geographic_consistency(adjusted_dfs, 'overall') + + folder_path = ( + f"{get_data_directory()}/targets/edition=cleaned/" + f"base_period={year}/reference_period={year}/" + f"variable=snap_households/" + ) + + clean_out = pd.concat([ + adjusted_dfs['National'][['GEO_ID', 'overall']], + adjusted_dfs['State'][['GEO_ID', 'overall']], + adjusted_dfs['DC'][['GEO_ID', 'overall']], + adjusted_dfs['District'][['GEO_ID', 'overall']] + ]).rename({"GEO_ID": "geography_id", "overall": "value"}, axis=1) + + clean_out.to_csv(os.path.join(folder_path, "part-001.csv"), index=False) + + +def reformat_cleaned_data(): + """Temporary conversion function""" + benefits_dir = Path(get_data_directory() / 'input' / 'benefits') + + snap_filepath = Path( + get_data_directory(), + "targets", + "edition=cleaned", + "base_period=2023", + "reference_period=2023", + "variable=snap_households", + "part-001.csv" + ) + snap_data = pd.read_csv(snap_filepath) + geo_hierarchies = pd.read_csv(Path(get_data_directory(), 'meta', 'geo_hierarchies.csv')) + + # Use Type II SCD to Filter geo_hierarchies for the year 2023 + geo_hierarchies['start_date'] = pd.to_datetime(geo_hierarchies['start_date']) + geo_hierarchies['end_date'] = pd.to_datetime(geo_hierarchies['end_date']) + geo_hierarchies_2023 = geo_hierarchies[ + (geo_hierarchies['start_date'] <= '2023-01-01') & + (geo_hierarchies['end_date'] >= '2023-01-01') + ] + + merged_data = pd.merge(snap_data, geo_hierarchies_2023, left_on='geography_id', right_on='geography_id') + + def create_cleaned_df(data, geo_name_map=None, geo_name_prefix=''): + df = pd.DataFrame() + df['GEO_ID'] = data['geography_id'] + if geo_name_map: + df['GEO_NAME'] = data['geography_id'].map(geo_name_map) + elif 'geography_name' in data.columns: + df['GEO_NAME'] = data['geography_name'] + else: + df['GEO_NAME'] = '' + + df['AGI_LOWER_BOUND'] = '' + df['AGI_UPPER_BOUND'] = '' + df['VALUE'] = data['value'] + df['IS_COUNT'] = 1 + df['VARIABLE'] = 'snap_households' + return df + + # National data + national_data = merged_data[merged_data['geography_type'] == 'nation'].copy() + national_data['geography_name'] = 'US' + 
cleaned_national = create_cleaned_df(national_data) + cleaned_national.to_csv(Path(get_data_directory(), 'input', 'benefits', 'cleaned_snap_national.csv'), index=False) + + # State data + state_data = merged_data[merged_data['geography_type'] == 'state-equivalent'].copy() + # TODO: fix this redundancy if this becomes permanenent + state_fips_map = { + '01': 'AL', '02': 'AK', '04': 'AZ', '05': 'AR', '06': 'CA', '08': 'CO', '09': 'CT', '10': 'DE', '11': 'DC', + '12': 'FL', '13': 'GA', '15': 'HI', '16': 'ID', '17': 'IL', '18': 'IN', '19': 'IA', '20': 'KS', '21': 'KY', + '22': 'LA', '23': 'ME', '24': 'MD', '25': 'MA', '26': 'MI', '27': 'MN', '28': 'MS', '29': 'MO', '30': 'MT', + '31': 'NE', '32': 'NV', '33': 'NH', '34': 'NJ', '35': 'NM', '36': 'NY', '37': 'NC', '38': 'ND', '39': 'OH', + '40': 'OK', '41': 'OR', '42': 'PA', '44': 'RI', '45': 'SC', '46': 'SD', '47': 'TN', '48': 'TX', '49': 'UT', + '50': 'VT', '51': 'VA', '53': 'WA', '54': 'WV', '55': 'WI', '56': 'WY' + } + state_data['state_fips'] = state_data['geography_id'].str[-2:] + state_data['geography_name'] = state_data['state_fips'].map(state_fips_map) + cleaned_state = create_cleaned_df(state_data) + cleaned_state.to_csv(Path('us-congressional-districts/data/input/benefits/cleaned_snap_state.csv', index=False) + cleaned_state.to_csv(Path(get_data_directory(), 'input', 'benefits', 'cleaned_snap_state.csv'), index=False) + + # District data + district_data = merged_data[merged_data['geography_type'] == 'district'].copy() + district_data['state_fips'] = district_data['geography_id'].str[9:11] + district_data['district_num'] = district_data['geography_id'].str[11:] + district_data['geography_name'] = district_data['state_fips'].map(state_fips_map) + ' - District ' + district_data['district_num'] + cleaned_district = create_cleaned_df(district_data) + cleaned_district["VALUE"] = cleaned_district["VALUE"].round().astype(int) + cleaned_district.to_csv(Path(get_data_directory(), 'input', 'benefits', 'cleaned_snap_district.csv'), index=False) + + + +if __name__ == "__main__": + process_snap_data(2023) + + + +def main() -> None: + year = 2023 + + zip_file = extract_snap_data(2023) + + +if __name__ == "__main__": + main() diff --git a/policyengine_us_data/utils/census.py b/policyengine_us_data/utils/census.py new file mode 100644 index 00000000..a728b5ca --- /dev/null +++ b/policyengine_us_data/utils/census.py @@ -0,0 +1,42 @@ +import pathlib +import requests + +import pandas as pd +import numpy as np + + +def get_census_docs(year): + docs_url = ( + f"https://api.census.gov/data/{year}/acs/acs1/subject/variables.json" + ) + # TODO: Alternative: incorporate it! 
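+    # (The bare string below is just a breadcrumb for the detailed-table
+    # variables endpoint; it is not used by this function.)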
+ "https://api.census.gov/data/2023/acs/acs1/variables.json" + + docs_response = requests.get(docs_url) + docs_response.raise_for_status() + + return docs_response.json() + + +def pull_acs_table(group: str, geo: str, year: int) -> pd.DataFrame: + """ + "group": e.g., 'S2201' + "geo": 'National' | 'State' | 'District' + "year": e.g., 2023 + """ + base = f"https://api.census.gov/data/{year}/acs/acs1" + + if group[0] == 'S': + base = base + "/subject" + geo_q = { + "National": "us:*", + "State": "state:*", + "District": "congressional+district:*", + }[geo] + + url = f"{base}?get=group({group})&for={geo_q}" + + data = requests.get(url).json() + headers, rows = data[0], data[1:] + df = pd.DataFrame(rows, columns=headers) + return df From 7b3cacc0186012db08329b89593143aefbdf09c8 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Tue, 12 Aug 2025 13:37:28 -0400 Subject: [PATCH 10/27] got SNAP settled --- ...sury_targets.py => etl_irs_soi_targets.py} | 0 policyengine_us_data/db/etl_snap.py | 272 ++++++++---------- policyengine_us_data/db/load_age_targets.py | 55 ---- policyengine_us_data/utils/census.py | 55 ++++ 4 files changed, 173 insertions(+), 209 deletions(-) rename policyengine_us_data/db/{load_treasury_targets.py => etl_irs_soi_targets.py} (100%) diff --git a/policyengine_us_data/db/load_treasury_targets.py b/policyengine_us_data/db/etl_irs_soi_targets.py similarity index 100% rename from policyengine_us_data/db/load_treasury_targets.py rename to policyengine_us_data/db/etl_irs_soi_targets.py diff --git a/policyengine_us_data/db/etl_snap.py b/policyengine_us_data/db/etl_snap.py index a82da744..a0f20133 100644 --- a/policyengine_us_data/db/etl_snap.py +++ b/policyengine_us_data/db/etl_snap.py @@ -18,7 +18,8 @@ ) from policyengine_us_data.utils.census import ( get_census_docs, - pull_subject_table, + pull_acs_table, + STATE_NAME_TO_FIPS, ) @@ -76,6 +77,7 @@ "Wyoming": "56", } +# Administrative data ------------------------------------------------ def extract_administrative_snap_data(year=2023): """ @@ -120,7 +122,7 @@ def extract_administrative_snap_data(year=2023): return zipfile.ZipFile(io.BytesIO(response.content)) -def transform_snap_administrative_data(zip_file, year): +def transform_administrative_snap_data(zip_file, year): filename = f"FY{str(year)[-2:]}.xlsx" with zip_file.open(filename) as f: xls = pd.ExcelFile(f) @@ -180,17 +182,10 @@ def transform_snap_administrative_data(zip_file, year): ) df_states["ucgid_str"] = "0400000US" + df_states["STATE_FIPS"] - # I don't think I need to make this long, because it's going to be 3 different variables - #df_states[['ucgid_str', 'Households']] - #df_states[['ucgid_str', 'Persons']] - #df_states[['ucgid_str', 'Cost']] - return df_states -def load_snap_administrative_data(?, year): - - year = 2023 +def load_administrative_snap_data(df_states, year): DATABASE_URL = "sqlite:///policy_data.db" engine = create_engine(DATABASE_URL) @@ -207,7 +202,7 @@ def load_snap_administrative_data(?, year): nat_stratum.constraints_rel = [ StratumConstraint( constraint_variable="ucgid_str", - operation="equals", + operation="in", value="0100000US", ), StratumConstraint( @@ -235,7 +230,7 @@ def load_snap_administrative_data(?, year): new_stratum.constraints_rel = [ StratumConstraint( constraint_variable="ucgid_str", - operation="equals", + operation="in", value=row["ucgid_str"], ), StratumConstraint( @@ -268,170 +263,139 @@ def load_snap_administrative_data(?, year): stratum_lookup["State"][row['ucgid_str']] = new_stratum.stratum_id session.commit() + 
return stratum_lookup - -# Moving away from administrative data to get the survey data ------ - - +# Survey data ------------------------------------------------------ def extract_survey_snap_data(year): - # Household count data ----- - data = pull_acs_table("S2201", "National", 2023) - data["S2201_C03_001E"] - - - # Ha, this is off my a factor of 1000, and ACS does not report dollars in 1000s - # TODO: try to figure it out. - data = pull_acs_table("B19058", "State", 2023) - np.sum(data["B19058_001E"].values.astype(int)) / 1E9 + raw_dfs = {} + for geo in ["District", "State", "National"]: + df = pull_acs_table("S2201", geo, year) + raw_dfs[geo] = df + return raw_dfs - raw_dfs = {} - for geo in ["District", "State", "National"]: - df = pull_subject_table(group, geo, year) - df_data = df.rename(columns=rename_mapping)[ - ["GEO_ID", "NAME"] + list(label_to_short_name_mapping.values()) - ] - if geo == "State": - raw_dfs["DC"] = df_data[df_data["GEO_ID"].isin(["0400000US11"])] +def transform_survey_snap_data(raw_dfs): - # Filter out Puerto Rico - df_geos = df_data[ - ~df_data["GEO_ID"].isin( - [ - "5001800US7298", + dfs = {} + for geo in raw_dfs.keys(): + df = raw_dfs[geo] + dfs[geo] = df_data = df[["GEO_ID", "S2201_C03_001E"]].rename({ + "GEO_ID": "ucgid_str", + "S2201_C03_001E": "snap_household_ct" + }, axis=1 + )[ + ~df["GEO_ID"].isin( + [ # Puerto Rico's state and district "0400000US72", + "5001800US7298", ] ) ].copy() - raw_dfs[geo] = df_geos - SAVE_DIR = Path(get_data_directory() / "input" / "demographics") - df_geos.to_csv(SAVE_DIR / f"raw_snap_{geo}.csv", index=False) - - folder_path = ( - f"{get_data_directory()}/targets/edition=raw/" - f"base_period={year}/reference_period={year}/" - f"variable=snap_households/" - ) - raw_out = pd.concat([ - raw_dfs['National'][['GEO_ID', 'overall']], - raw_dfs['State'][['GEO_ID', 'overall']], - raw_dfs['DC'][['GEO_ID', 'overall']], - raw_dfs['District'][['GEO_ID', 'overall']] - ]).rename({"GEO_ID": "geography_id", "overall": "value"}, axis=1) - - raw_out.to_csv(os.path.join(folder_path, "part-001.csv"), index=False) - - additive_dfs = enforce_geographic_self_consistency(raw_dfs, 'overall') - usda_snap_df = extract_usda_snap_data() - adjusted_dfs = adjust_to_administrative_data(additive_dfs, 'overall', usda_snap_df) - assert check_geographic_consistency(adjusted_dfs, 'overall') - - folder_path = ( - f"{get_data_directory()}/targets/edition=cleaned/" - f"base_period={year}/reference_period={year}/" - f"variable=snap_households/" - ) - - clean_out = pd.concat([ - adjusted_dfs['National'][['GEO_ID', 'overall']], - adjusted_dfs['State'][['GEO_ID', 'overall']], - adjusted_dfs['DC'][['GEO_ID', 'overall']], - adjusted_dfs['District'][['GEO_ID', 'overall']] - ]).rename({"GEO_ID": "geography_id", "overall": "value"}, axis=1) - - clean_out.to_csv(os.path.join(folder_path, "part-001.csv"), index=False) - - -def reformat_cleaned_data(): - """Temporary conversion function""" - benefits_dir = Path(get_data_directory() / 'input' / 'benefits') - - snap_filepath = Path( - get_data_directory(), - "targets", - "edition=cleaned", - "base_period=2023", - "reference_period=2023", - "variable=snap_households", - "part-001.csv" + + return dfs + + +def load_survey_snap_data(survey_dfs, year, stratum_lookup ={}): + """Use an already defined stratum_lookup to load the survey SNAP data""" + + DATABASE_URL = "sqlite:///policy_data.db" + engine = create_engine(DATABASE_URL) + + Session = sessionmaker(bind=engine) + session = Session() + + # National. 
Use the stratum from the administrative data function + nat_df = survey_dfs["National"] + nat_stratum = session.get(Stratum, stratum_lookup["National"]) + + nat_stratum.targets_rel.append( + Target( + variable="household_count", + period=year, + value=nat_df["snap_household_ct"], + source_id=4, + active=True, + ) ) - snap_data = pd.read_csv(snap_filepath) - geo_hierarchies = pd.read_csv(Path(get_data_directory(), 'meta', 'geo_hierarchies.csv')) - - # Use Type II SCD to Filter geo_hierarchies for the year 2023 - geo_hierarchies['start_date'] = pd.to_datetime(geo_hierarchies['start_date']) - geo_hierarchies['end_date'] = pd.to_datetime(geo_hierarchies['end_date']) - geo_hierarchies_2023 = geo_hierarchies[ - (geo_hierarchies['start_date'] <= '2023-01-01') & - (geo_hierarchies['end_date'] >= '2023-01-01') - ] - - merged_data = pd.merge(snap_data, geo_hierarchies_2023, left_on='geography_id', right_on='geography_id') - - def create_cleaned_df(data, geo_name_map=None, geo_name_prefix=''): - df = pd.DataFrame() - df['GEO_ID'] = data['geography_id'] - if geo_name_map: - df['GEO_NAME'] = data['geography_id'].map(geo_name_map) - elif 'geography_name' in data.columns: - df['GEO_NAME'] = data['geography_name'] - else: - df['GEO_NAME'] = '' - - df['AGI_LOWER_BOUND'] = '' - df['AGI_UPPER_BOUND'] = '' - df['VALUE'] = data['value'] - df['IS_COUNT'] = 1 - df['VARIABLE'] = 'snap_households' - return df - - # National data - national_data = merged_data[merged_data['geography_type'] == 'nation'].copy() - national_data['geography_name'] = 'US' - cleaned_national = create_cleaned_df(national_data) - cleaned_national.to_csv(Path(get_data_directory(), 'input', 'benefits', 'cleaned_snap_national.csv'), index=False) - - # State data - state_data = merged_data[merged_data['geography_type'] == 'state-equivalent'].copy() - # TODO: fix this redundancy if this becomes permanenent - state_fips_map = { - '01': 'AL', '02': 'AK', '04': 'AZ', '05': 'AR', '06': 'CA', '08': 'CO', '09': 'CT', '10': 'DE', '11': 'DC', - '12': 'FL', '13': 'GA', '15': 'HI', '16': 'ID', '17': 'IL', '18': 'IN', '19': 'IA', '20': 'KS', '21': 'KY', - '22': 'LA', '23': 'ME', '24': 'MD', '25': 'MA', '26': 'MI', '27': 'MN', '28': 'MS', '29': 'MO', '30': 'MT', - '31': 'NE', '32': 'NV', '33': 'NH', '34': 'NJ', '35': 'NM', '36': 'NY', '37': 'NC', '38': 'ND', '39': 'OH', - '40': 'OK', '41': 'OR', '42': 'PA', '44': 'RI', '45': 'SC', '46': 'SD', '47': 'TN', '48': 'TX', '49': 'UT', - '50': 'VT', '51': 'VA', '53': 'WA', '54': 'WV', '55': 'WI', '56': 'WY' - } - state_data['state_fips'] = state_data['geography_id'].str[-2:] - state_data['geography_name'] = state_data['state_fips'].map(state_fips_map) - cleaned_state = create_cleaned_df(state_data) - cleaned_state.to_csv(Path('us-congressional-districts/data/input/benefits/cleaned_snap_state.csv', index=False) - cleaned_state.to_csv(Path(get_data_directory(), 'input', 'benefits', 'cleaned_snap_state.csv'), index=False) - - # District data - district_data = merged_data[merged_data['geography_type'] == 'district'].copy() - district_data['state_fips'] = district_data['geography_id'].str[9:11] - district_data['district_num'] = district_data['geography_id'].str[11:] - district_data['geography_name'] = district_data['state_fips'].map(state_fips_map) + ' - District ' + district_data['district_num'] - cleaned_district = create_cleaned_df(district_data) - cleaned_district["VALUE"] = cleaned_district["VALUE"].round().astype(int) - cleaned_district.to_csv(Path(get_data_directory(), 'input', 'benefits', 
'cleaned_snap_district.csv'), index=False) + session.add(nat_stratum) + session.flush() + # Skipping state for now, but + # # State. Also use the stratum from the administrative data function + # state_df = survey_dfs["State"] + # for _, row in state_df.iterrows(): + # print(row) + # state_stratum = session.get(Stratum, stratum_lookup["State"][row["ucgid_str"]]) + + # state_stratum.targets_rel.append( + # Target( + # variable="household_count", + # period=year, + # value=row["snap_household_ct"], + # source_id=4, + # active=True, + # ) + # ) + # session.add(state_stratum) + # session.flush() + + # You will need to create new strata for districts + district_df = survey_dfs["District"] + for _, row in district_df.iterrows(): + note = f"Geo: {row['ucgid_str']} Received SNAP Benefits" + state_ucgid_str = '0400000US' + row['ucgid_str'][9:11] + state_stratum_id = stratum_lookup['State'][state_ucgid_str] + new_stratum = Stratum( + parent_stratum_id=state_stratum_id, stratum_group_id=0, notes=note + ) + new_stratum.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="in", + value=row["ucgid_str"], + ), + StratumConstraint( + constraint_variable="snap", + operation="greater_than", + value='0', + ), + ] + new_stratum.targets_rel.append( + Target( + variable="household_count", + period=year, + value=row["snap_household_ct"], + source_id=4, + active=True, + ) + ) + session.add(new_stratum) + session.flush() -if __name__ == "__main__": - process_snap_data(2023) + session.commit() + return stratum_lookup -def main() -> None: +def main(): year = 2023 - zip_file = extract_snap_data(2023) + # Extract --------- + zip_file_admin = extract_administrative_snap_data() + raw_survey_dfs = extract_survey_snap_data(year) + + # Transform ------- + state_admin_df = transform_administrative_snap_data(zip_file_admin, year) + survey_dfs = transform_survey_snap_data(raw_survey_dfs) + + # Load ----------- + stratum_lookup = load_administrative_snap_data(state_admin_df, year) + load_survey_snap_data(survey_dfs, year, stratum_lookup) if __name__ == "__main__": diff --git a/policyengine_us_data/db/load_age_targets.py b/policyengine_us_data/db/load_age_targets.py index f42adcf3..f5142e17 100644 --- a/policyengine_us_data/db/load_age_targets.py +++ b/policyengine_us_data/db/load_age_targets.py @@ -17,61 +17,6 @@ logger = logging.getLogger(__name__) -STATE_NAME_TO_ABBREV = { - "Alabama": "AL", - "Alabama": "AL", - "Alaska": "AK", - "Arizona": "AZ", - "Arkansas": "AR", - "California": "CA", - "Colorado": "CO", - "Connecticut": "CT", - "Delaware": "DE", - "District of Columbia": "DC", - "Florida": "FL", - "Georgia": "GA", - "Hawaii": "HI", - "Idaho": "ID", - "Illinois": "IL", - "Indiana": "IN", - "Iowa": "IA", - "Kansas": "KS", - "Kentucky": "KY", - "Louisiana": "LA", - "Maine": "ME", - "Maryland": "MD", - "Massachusetts": "MA", - "Michigan": "MI", - "Minnesota": "MN", - "Mississippi": "MS", - "Missouri": "MO", - "Montana": "MT", - "Nebraska": "NE", - "Nevada": "NV", - "New Hampshire": "NH", - "New Jersey": "NJ", - "New Mexico": "NM", - "New York": "NY", - "North Carolina": "NC", - "North Dakota": "ND", - "Ohio": "OH", - "Oklahoma": "OK", - "Oregon": "OR", - "Pennsylvania": "PA", - "Rhode Island": "RI", - "South Carolina": "SC", - "South Dakota": "SD", - "Tennessee": "TN", - "Texas": "TX", - "Utah": "UT", - "Vermont": "VT", - "Virginia": "VA", - "Washington": "WA", - "West Virginia": "WV", - "Wisconsin": "WI", - "Wyoming": "WY", -} - LABEL_TO_SHORT = { "Estimate!!Total!!Total 
population!!AGE!!Under 5 years": "0-4", diff --git a/policyengine_us_data/utils/census.py b/policyengine_us_data/utils/census.py index a728b5ca..69a475fb 100644 --- a/policyengine_us_data/utils/census.py +++ b/policyengine_us_data/utils/census.py @@ -5,6 +5,61 @@ import numpy as np +STATE_NAME_TO_FIPS = { + "Alabama": "01", + "Alaska": "02", + "Arizona": "04", + "Arkansas": "05", + "California": "06", + "Colorado": "08", + "Connecticut": "09", + "Delaware": "10", + "District of Columbia": "11", + "Florida": "12", + "Georgia": "13", + "Hawaii": "15", + "Idaho": "16", + "Illinois": "17", + "Indiana": "18", + "Iowa": "19", + "Kansas": "20", + "Kentucky": "21", + "Louisiana": "22", + "Maine": "23", + "Maryland": "24", + "Massachusetts": "25", + "Michigan": "26", + "Minnesota": "27", + "Mississippi": "28", + "Missouri": "29", + "Montana": "30", + "Nebraska": "31", + "Nevada": "32", + "New Hampshire": "33", + "New Jersey": "34", + "New Mexico": "35", + "New York": "36", + "North Carolina": "37", + "North Dakota": "38", + "Ohio": "39", + "Oklahoma": "40", + "Oregon": "41", + "Pennsylvania": "42", + "Rhode Island": "44", + "South Carolina": "45", + "South Dakota": "46", + "Tennessee": "47", + "Texas": "48", + "Utah": "49", + "Vermont": "50", + "Virginia": "51", + "Washington": "53", + "West Virginia": "54", + "Wisconsin": "55", + "Wyoming": "56", +} + + def get_census_docs(year): docs_url = ( f"https://api.census.gov/data/{year}/acs/acs1/subject/variables.json" From e45072e9cd4dc5b30fcb81714bda359a753b05fc Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Tue, 12 Aug 2025 16:50:14 -0400 Subject: [PATCH 11/27] progress --- .../db/{load_age_targets.py => etl_age.py} | 4 - ...tl_irs_soi_targets.py => etl_eitc_only.py} | 0 policyengine_us_data/db/etl_irs_soi.py | 419 +++++++++++ policyengine_us_data/db/load_soi_targets.py | 672 ------------------ 4 files changed, 419 insertions(+), 676 deletions(-) rename policyengine_us_data/db/{load_age_targets.py => etl_age.py} (99%) rename policyengine_us_data/db/{etl_irs_soi_targets.py => etl_eitc_only.py} (100%) create mode 100644 policyengine_us_data/db/etl_irs_soi.py delete mode 100644 policyengine_us_data/db/load_soi_targets.py diff --git a/policyengine_us_data/db/load_age_targets.py b/policyengine_us_data/db/etl_age.py similarity index 99% rename from policyengine_us_data/db/load_age_targets.py rename to policyengine_us_data/db/etl_age.py index f5142e17..e168317b 100644 --- a/policyengine_us_data/db/load_age_targets.py +++ b/policyengine_us_data/db/etl_age.py @@ -1,4 +1,3 @@ -import logging import requests from pathlib import Path import io @@ -15,9 +14,6 @@ ) -logger = logging.getLogger(__name__) - - LABEL_TO_SHORT = { "Estimate!!Total!!Total population!!AGE!!Under 5 years": "0-4", "Estimate!!Total!!Total population!!AGE!!5 to 9 years": "5-9", diff --git a/policyengine_us_data/db/etl_irs_soi_targets.py b/policyengine_us_data/db/etl_eitc_only.py similarity index 100% rename from policyengine_us_data/db/etl_irs_soi_targets.py rename to policyengine_us_data/db/etl_eitc_only.py diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py new file mode 100644 index 00000000..d9eeb503 --- /dev/null +++ b/policyengine_us_data/db/etl_irs_soi.py @@ -0,0 +1,419 @@ +from pathlib import Path +from typing import List, Optional, Sequence, Dict, Tuple, Any, Union + +import numpy as np +import pandas as pd + +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker + +from 
policyengine_us_data.db.create_database_tables import ( + Stratum, + StratumConstraint, + Target, +) + + + + +"""Utilities to pull AGI targets from the IRS SOI data files.""" + +# Congressional districts have one fewer level than the national and state +# They're missing the million plus category +# ("No AGI Stub") is a specific, intentional category used by the IRS in its summary data files. +# +#SOI_COLUMNS = [ +# "Under $1", +# "$1 under $10,000", +# "$10,000 under $25,000", +# "$25,000 under $50,000", +# "$50,000 under $75,000", +# "$75,000 under $100,000", +# "$100,000 under $200,000", +# "$200,000 under $500,000", +# "$500,000 or more", +#] +# +#AGI_STUB_TO_BAND = {i + 1: band for i, band in enumerate(SOI_COLUMNS)} +# +#AGI_BOUNDS = { +# "Under $1": (-np.inf, 1), +# "$1 under $10,000": (1, 10_000), +# "$10,000 under $25,000": (10_000, 25_000), +# "$25,000 under $50,000": (25_000, 50_000), +# "$50,000 under $75,000": (50_000, 75_000), +# "$75,000 under $100,000": (75_000, 100_000), +# "$100,000 under $200,000": (100_000, 200_000), +# "$200,000 under $500,000": (200_000, 500_000), +# "$500,000 or more": (500_000, np.inf), +#} +# +##NON_VOTING_STATES = {"US", "AS", "GU", "MP", "PR", "VI", "OA"} +# +IGNORE_GEO_IDS = { + "0400000US72", # Puerto Rico (state level) + "5001800US7298", # Puerto Rico + "5001800US6098", # American Samoa + "5001800US6698", # Guam + "5001800US6998", # Northern Mariana Islands + "5001800US7898", # U.S. Virgin Islands +} + + +def create_records(df, breakdown_variable, target_variable): + """Transforms a DataFrame subset into a standardized list of records.""" + temp_df = df[["ucgid_str"]].copy() + temp_df["breakdown_variable"] = breakdown_variable + temp_df["breakdown_value"] = df[breakdown_variable] + temp_df["target_variable"] = target_variable + temp_df["target_value"] = df[target_variable] + return temp_df + + +def make_records( + df: pd.DataFrame, + *, + count_col: str, + amount_col: str, + amount_name: str, + breakdown_col: Optional[str] = None, + multiplier: int = 1_000, +): + df = ( + df.rename({count_col: "tax_unit_count", + amount_col: amount_name}, + axis=1) + .copy() + ) + + if breakdown_col is None: + breakdown_col = "one" + df[breakdown_col] = 1 + + rec_counts = create_records(df, breakdown_col, "tax_unit_count") + rec_amounts = create_records(df, breakdown_col, amount_name) + rec_amounts["target_value"] *= multiplier # Only the amounts get * 1000 + rec_counts["target_variable"] = f"{amount_name}_tax_unit_count" + + return rec_counts, rec_amounts + + +def make_agi_long(df: pd.DataFrame) -> pd.DataFrame: + """Convert IRS SOI AGI‑split table from wide to the long format used""" + target_col_map = { + "N1": "agi_tax_unit_count", + "N2": "agi_person_count", + "A00100": "agi_total_amount", + } + work = ( + df[["ucgid_str", "agi_stub"] + list(target_col_map)] + .rename(columns=target_col_map) + ) + long = ( + work.melt( + id_vars=["ucgid_str", "agi_stub"], + var_name="target_variable", + value_name="target_value" + ) + .rename(columns={"agi_stub": "breakdown_value"}) + .assign(breakdown_variable="agi_stub") + ) + long = long[["ucgid_str", + "breakdown_variable", + "breakdown_value", + "target_variable", + "target_value"]] + return ( + long.sort_values(["ucgid_str", "breakdown_value", "target_variable"]) + .reset_index(drop=True) + ) + + +def extract_soi_data() -> pd.DataFrame: + """Download and save congressional district AGI totals. 
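+    (A year-parameterized URL, assuming the IRS keeps this naming scheme,
+    would be f"https://www.irs.gov/pub/irs-soi/{str(year)[-2:]}incd.csv".)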
+ + In the file below, "22" is 2022, "in" is individual returns, + "cd" is congressional districts + """ + return pd.read_csv("https://www.irs.gov/pub/irs-soi/22incd.csv") + + +def transform_soi_data(raw_df): + + TARGETS = [ + dict(code="59661", name="eitc", breakdown=("eitc_children", 0)), + dict(code="59662", name="eitc", breakdown=("eitc_children", 1)), + dict(code="59663", name="eitc", breakdown=("eitc_children", 2)), + dict(code="59664", name="eitc", breakdown=("eitc_children", "3+")), + dict(code="59664", name="qbid", breakdown=None), + dict(code="18500", name="real_estate_taxes", breakdown=None), + dict(code="01000", name="net_capital_gain", breakdown=None), + dict(code="03150", name="ira_payments", breakdown=None), + dict(code="00300", name="taxable_interest", breakdown=None), + dict(code="00400", name="tax_exempt_interest", breakdown=None), + dict(code="00600", name="oridinary_dividends", breakdown=None), + dict(code="00650", name="qualified_dividends", breakdown=None), + dict(code="26270", name="partnership_and_s_crop_net_income", breakdown=None), + dict(code="02500", name="total_social_security", breakdown=None), + dict(code="01700", name="pension_and_annuities", breakdown=None), + dict(code="02300", name="unemployment_compensation", breakdown=None), + dict(code="00900", name="business_net_income", breakdown=None), + dict(code="17000", name="medical_and_dental_deduction", breakdown=None), + dict(code="00700", name="salt_refunds", breakdown=None), + dict(code="18425", name="salt_amount", breakdown=None), + dict(code="06500", name="income_tax", breakdown=None), + ] + + # National --------------- + national_df = raw_df.copy().loc[ + (raw_df.STATE == "US") + ] + national_df["ucgid_str"] = "0100000US" + + # State ------------------- + # You've got agi_stub == 0 in here, which you want to use any time you don't want to + # break things up by AGI + state_df = raw_df.copy().loc[ + (raw_df.STATE != "US") & + (raw_df.CONG_DISTRICT == 0) + ] + state_df["ucgid_str"] = "0400000US" + state_df["STATEFIPS"].astype(str).str.zfill(2) + + # District ------------------ + # This is going to fail because we're missing the single cong district states + district_df = raw_df.copy().loc[ + (raw_df.CONG_DISTRICT > 0) + ] + + max_cong_district_by_state = raw_df.groupby('STATE')['CONG_DISTRICT'].transform('max') + district_df = raw_df.copy().loc[ + (raw_df['CONG_DISTRICT'] > 0) | (max_cong_district_by_state == 0) + ] + district_df = district_df.loc[district_df['STATE'] != 'US'] + district_df["STATEFIPS"] = district_df["STATEFIPS"].astype(int).astype(str).str.zfill(2) + district_df["CONG_DISTRICT"] = ( + district_df["CONG_DISTRICT"].astype(int).astype(str).str.zfill(2) + ) + district_df["ucgid_str"] = "5001800US" + district_df["STATEFIPS"] + district_df["CONG_DISTRICT"] + district_df = district_df[~district_df["ucgid_str"].isin(IGNORE_GEO_IDS)] + + assert district_df.shape[0] % 436 == 0 + + all_df = pd.concat([national_df, state_df, district_df]) + + # "Marginal" over AGI bands, which this data set is organized according to + all_marginals = all_df.copy().loc[all_df.agi_stub == 0] + assert all_marginals.shape[0] == 436 + 51 + 1 + + # Collect targets from the SOI file + records = [] + for spec in TARGETS: + count_col = f"N{spec['code']}" # e.g. 'N59661' + amount_col = f"A{spec['code']}" # e.g. 
'A59661' + + df = all_marginals.copy() + + if spec["breakdown"] is not None: + col, val = spec["breakdown"] + df[col] = val + breakdown_col = col + else: + breakdown_col = None + + rec_counts, rec_amounts = make_records( + df, + count_col = count_col, + amount_col = amount_col, + amount_name = spec["name"], + breakdown_col = breakdown_col, + multiplier = 1_000, + ) + records.extend([rec_counts, rec_amounts]) + + + # AGI Processing (separate, doesn't have a count column) + temp_df = df[["ucgid_str"]].copy() + temp_df["breakdown_variable"] = "one" + temp_df["breakdown_value"] = 1 + temp_df["target_variable"] = "agi" + temp_df["target_value"] = df["A00100"] * 1_000 + + records.append(temp_df) + + # Note: national counts only have agi_stub = 0 + all_agi_splits = all_df.copy().loc[all_df.agi_stub != 0] + assert all_agi_splits.shape[0] % (436 + 51 + 0) == 0 + + agi_long = make_agi_long(all_agi_splits) + agi_long = agi_long.loc[agi_long.target_variable != "agi_total_amount"] + + records.append(agi_long) + + return pd.concat(records) + + +def load_soi_data(long_dfs, year): + + DATABASE_URL = "sqlite:///policy_data.db" + engine = create_engine(DATABASE_URL) + + Session = sessionmaker(bind=engine) + session = Session() + + # Load EITC data -------------------------------------------------------- + # NOTE: obviously this is not especially robust --- + eitc_data = {'0': (long_dfs[0], long_dfs[1]), + '1': (long_dfs[2], long_dfs[3]), + '2': (long_dfs[4], long_dfs[5]), + '3+': (long_dfs[6], long_dfs[7])} + + stratum_lookup = {"State": {}, "District": {}} + for n_children in eitc_data.keys(): + eitc_count_i, eitc_amount_i = eitc_data[n_children] + for i in range(eitc_count_i.shape[0]): + ucgid_i = eitc_count_i[['ucgid_str']].iloc[i].values[0] + note = f"Geo: {ucgid_i}, EITC received with {n_children} children" + + if len(ucgid_i) == 9: # National. + new_stratum = Stratum( + parent_stratum_id=None, stratum_group_id=0, notes=note + ) + elif len(ucgid_i) == 11: # State + new_stratum = Stratum( + parent_stratum_id=stratum_lookup["National"], + stratum_group_id=0, + notes=note + ) + elif len(ucgid_i) == 13: # District + new_stratum = Stratum( + parent_stratum_id=stratum_lookup["State"]['0400000US' + ucgid_i[9:11]], + stratum_group_id=0, + notes=note + ) + + new_stratum.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="in", + value=ucgid_i, + ), + ] + if n_children == "3+": + new_stratum.constraints_rel.append( + StratumConstraint( + constraint_variable="eitc_children", + operation="greater_than_or_equal_to", + value='3', + ) + ) + else: + new_stratum.constraints_rel.append( + StratumConstraint( + constraint_variable="eitc_children", + operation="equals", + value=f'{n_children}', + ) + ) + + new_stratum.targets_rel = [ + Target( + variable="tax_unit_count", + period=year, + value=eitc_count_i.iloc[i][["target_value"]].values[0], + source_id=5, + active=True, + ), + Target( + variable="eitc", + period=year, + value=eitc_amount_i.iloc[i][["target_value"]].values[0], + source_id=5, + active=True, + ) + ] + + session.add(new_stratum) + session.flush() + + if len(ucgid_i) == 9: + stratum_lookup["National"] = new_stratum.stratum_id + elif len(ucgid_i) == 11: + stratum_lookup["State"][ucgid_i] = new_stratum.stratum_id + + + # No breakdown variables in this set + for j in range(8, 42, 2): + print(long_dfs[j]) # count + print(long_dfs[j + 1]) # amount + + # Why are we making strata here? 
You have a lot of these to run through + count_j, amount_j = long_dfs[j], long_dfs[j + 1] + for i in range(count_j.shape[0]): + ucgid_i = count_j[['ucgid_str']].iloc[i].values[0] + # If there's no breakdown variable, is this a new geo? + # The problem is, it's vary difficult to search for a geography + # That's already in existance + note = f"Geo: {ucgid_i}" + + if len(ucgid_i) == 9: # National. + new_stratum = Stratum( + parent_stratum_id=None, stratum_group_id=0, notes=note + ) + elif len(ucgid_i) == 11: # State + new_stratum = Stratum( + parent_stratum_id=stratum_lookup["National"], + stratum_group_id=0, + notes=note + ) + elif len(ucgid_i) == 13: # District + new_stratum = Stratum( + parent_stratum_id=stratum_lookup["State"]['0400000US' + ucgid_i[9:11]], + stratum_group_id=0, + notes=note + ) + + new_stratum.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="in", + value=ucgid_i, + ), + ] + new_stratum.targets_rel = [ + Target( + variable="tax_unit_count", + period=year, + value=count_j.iloc[i][["target_value"]].values[0], + source_id=5, + active=True, + ), + Target( + variable=amount_j.iloc[0][["target_variable"]].values[0], + period=year, + value=amount_j.iloc[i][["target_value"]].values[0], + source_id=5, + active=True, + ) + ] + + session.add(new_stratum) + session.flush() + + if len(ucgid_i) == 9: + stratum_lookup["National"] = new_stratum.stratum_id + elif len(ucgid_i) == 11: + stratum_lookup["State"][ucgid_i] = new_stratum.stratum_id + + session.commit() + + + +def main() -> None: + year = 2022 # NOTE: predates the finalization of the 2020 Census redistricting + raw_df = extract_soi_data() + + long_dfs = transform_soi_data(raw_df): + + +if __name__ == "__main__": + main() diff --git a/policyengine_us_data/db/load_soi_targets.py b/policyengine_us_data/db/load_soi_targets.py deleted file mode 100644 index 2fe3fa91..00000000 --- a/policyengine_us_data/db/load_soi_targets.py +++ /dev/null @@ -1,672 +0,0 @@ -# This is the file where we actually get the SOI information that we want: - -# Goal: start with raw AGI and EITC: -# Data Dictionary: https://www.irs.gov/pub/irs-soi/22incddocguide.docx -# The Data: https://www.irs.gov/pub/irs-soi/22incd.csv - -from pathlib import Path -from typing import List, Optional, Sequence, Dict, Tuple, Any, Union - -import numpy as np -import pandas as pd -import logging - -from policyengine_us_data.storage import CALIBRATION_FOLDER - -logger = logging.getLogger(__name__) - -"""Utilities to pull AGI targets from the IRS SOI data files.""" - -# Congressional districts have one fewer level than the national and state -# They're missing the million plus category -# ("No AGI Stub") is a specific, intentional category used by the IRS in its summary data files. 
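
The band mismatch described in the comments above works out as follows: the national and state series report ten AGI stubs while districts report nine, so the million-plus stub and the one below it are collapsed into a single "$500,000 or more" band. A minimal sketch of that collapse, with made-up counts (not SOI values):

import numpy as np

# Ten hypothetical return counts, one per national AGI stub 1-10:
national_counts = np.array([5, 10, 20, 30, 25, 18, 40, 15, 6, 2])

# Sum the top two stubs into one "$500,000 or more" band so the vector
# lines up with the nine district-level bands:
district_bands = np.concatenate(
    [national_counts[:8], [national_counts[8] + national_counts[9]]]
)
assert district_bands.shape == (9,)
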
-SOI_COLUMNS = [ - "Under $1", - "$1 under $10,000", - "$10,000 under $25,000", - "$25,000 under $50,000", - "$50,000 under $75,000", - "$75,000 under $100,000", - "$100,000 under $200,000", - "$200,000 under $500,000", - "$500,000 or more", -] - -AGI_STUB_TO_BAND = {i + 1: band for i, band in enumerate(SOI_COLUMNS)} - -AGI_BOUNDS = { - "Under $1": (-np.inf, 1), - "$1 under $10,000": (1, 10_000), - "$10,000 under $25,000": (10_000, 25_000), - "$25,000 under $50,000": (25_000, 50_000), - "$50,000 under $75,000": (50_000, 75_000), - "$75,000 under $100,000": (75_000, 100_000), - "$100,000 under $200,000": (100_000, 200_000), - "$200,000 under $500,000": (200_000, 500_000), - "$500,000 or more": (500_000, np.inf), -} - -#NON_VOTING_STATES = {"US", "AS", "GU", "MP", "PR", "VI", "OA"} - -IGNORE_GEO_IDS = { - "0400000US72", # Puerto Rico (state level) - "5001800US7298", # Puerto Rico - "5001800US6098", # American Samoa - "5001800US6698", # Guam - "5001800US6998", # Northern Mariana Islands - "5001800US7898", # U.S. Virgin Islands -} - -# after skipping the first 7 rows, the national SOI file has targets as row indices [COUNT_INDEX, AMOUNT_INDEX] -NATIONAL_VARIABLES = { - "adjusted_gross_income": [0, 17], -} - -# the state and district SOI file have targets as column names [COUNT_COL_NAME, AMOUNT_COL_NAME] -GEOGRAPHY_VARIABLES = {"adjusted_gross_income": ["N1", "A00100"]} - -STATE_ABBR_TO_FIPS = { - "AL": "01", - "AK": "02", - "AZ": "04", - "AR": "05", - "CA": "06", - "CO": "08", - "CT": "09", - "DC": "11", - "DE": "10", - "FL": "12", - "GA": "13", - "HI": "15", - "ID": "16", - "IL": "17", - "IN": "18", - "IA": "19", - "KS": "20", - "KY": "21", - "LA": "22", - "ME": "23", - "MD": "24", - "MA": "25", - "MI": "26", - "MN": "27", - "MS": "28", - "MO": "29", - "MT": "30", - "NE": "31", - "NV": "32", - "NH": "33", - "NJ": "34", - "NM": "35", - "NY": "36", - "NC": "37", - "ND": "38", - "OH": "39", - "OK": "40", - "OR": "41", - "PA": "42", - "RI": "44", - "SC": "45", - "SD": "46", - "TN": "47", - "TX": "48", - "UT": "49", - "VT": "50", - "VA": "51", - "WA": "53", - "WV": "54", - "WI": "55", - "WY": "56", -} -FIPS_TO_STATE_ABBR = {v: k for k, v in STATE_ABBR_TO_FIPS.items()} - - -#def pull_national_soi_variable( -# soi_variable_ident: int, # the national SOI xlsx file has a row for each target variable -# variable_name: Union[str, None], -# is_count: bool, -# national_df: Optional[pd.DataFrame] = None, -#) -> pd.DataFrame: -# """Download and save national AGI totals.""" -# df = pd.read_excel( -# "https://www.irs.gov/pub/irs-soi/22in54us.xlsx", skiprows=7 -# ) -# -# assert ( -# np.abs( -# df.iloc[soi_variable_ident, 1] -# - df.iloc[soi_variable_ident, 2:12].sum() -# ) -# < 100 -# ), "Row 0 doesn't add up — check the file." 
-# -# agi_values = df.iloc[soi_variable_ident, 2:12].astype(int).to_numpy() -# agi_values = np.concatenate( -# [agi_values[:8], [agi_values[8] + agi_values[9]]] -# ) -# -# agi_brackets = [ -# AGI_STUB_TO_BAND[i] for i in range(1, len(SOI_COLUMNS) + 1) -# ] -# -# result = pd.DataFrame( -# { -# "GEO_ID": ["0100000US"] * len(agi_brackets), -# "GEO_NAME": ["national"] * len(agi_brackets), -# "LOWER_BOUND": [AGI_BOUNDS[b][0] for b in agi_brackets], -# "UPPER_BOUND": [AGI_BOUNDS[b][1] for b in agi_brackets], -# "VALUE": agi_values, -# } -# ) -# -# # final column order -# result = result[ -# ["GEO_ID", "GEO_NAME", "LOWER_BOUND", "UPPER_BOUND", "VALUE"] -# ] -# result["IS_COUNT"] = int(is_count) -# result["VARIABLE"] = variable_name -# -# result["VALUE"] = np.where( -# result["IS_COUNT"] == 0, result["VALUE"] * 1_000, result["VALUE"] -# ) -# -# if national_df is not None: -# # If a DataFrame is passed, we append the new data to it. -# df = pd.concat([national_df, result], ignore_index=True) -# return df -# -# return result -# -# -#def pull_state_soi_variable( -# soi_variable_ident: str, # the state SOI csv file has a column for each target variable -# variable_name: Union[str, None], -# is_count: bool, -# state_df: Optional[pd.DataFrame] = None, -#) -> pd.DataFrame: -# """Download and save state AGI totals.""" -# df = pd.read_csv( -# "https://www.irs.gov/pub/irs-soi/22in55cmcsv.csv", thousands="," -# ) -# -# merged = ( -# df[df["AGI_STUB"].isin([9, 10])] -# .groupby("STATE", as_index=False) -# .agg({soi_variable_ident: "sum"}) -# .assign(AGI_STUB=9) -# ) -# df = df[~df["AGI_STUB"].isin([9, 10])] -# df = pd.concat([df, merged], ignore_index=True) -# df = df[df["AGI_STUB"] != 0] -# -# df["agi_bracket"] = df["AGI_STUB"].map(AGI_STUB_TO_BAND) -# -# df["state_abbr"] = df["STATE"] -# df["GEO_ID"] = "0400000US" + df["state_abbr"].map(STATE_ABBR_TO_FIPS) -# df["GEO_NAME"] = "state_" + df["state_abbr"] -# -# result = df.loc[ -# ~df["STATE"].isin(NON_VOTING_STATES.union({"US"})), -# ["GEO_ID", "GEO_NAME", "agi_bracket", soi_variable_ident], -# ].rename(columns={soi_variable_ident: "VALUE"}) -# -# result["LOWER_BOUND"] = result["agi_bracket"].map( -# lambda b: AGI_BOUNDS[b][0] -# ) -# result["UPPER_BOUND"] = result["agi_bracket"].map( -# lambda b: AGI_BOUNDS[b][1] -# ) -# -# # final column order -# result = result[ -# ["GEO_ID", "GEO_NAME", "LOWER_BOUND", "UPPER_BOUND", "VALUE"] -# ] -# result["IS_COUNT"] = int(is_count) -# result["VARIABLE"] = variable_name -# -# result["VALUE"] = np.where( -# result["IS_COUNT"] == 0, result["VALUE"] * 1_000, result["VALUE"] -# ) -# -# if state_df is not None: -# # If a DataFrame is passed, we append the new data to it. 
-# df = pd.concat([state_df, result], ignore_index=True) -# return df -# -# return result - -def create_records(df, breakdown_variable, target_variable): - """Transforms a DataFrame subset into a standardized list of records.""" - temp_df = df[["ucgid_str"]].copy() - temp_df["breakdown_variable"] = breakdown_variable - temp_df["breakdown_value"] = df[breakdown_variable] - temp_df["target_variable"] = target_variable - temp_df["target_value"] = df[target_variable] - return temp_df - - -def make_records( - df: pd.DataFrame, - *, - count_col: str, - amount_col: str, - amount_name: str, - breakdown_col: Optional[str] = None, - multiplier: int = 1_000, -): - df = ( - df.rename({count_col: "tax_unit_count", - amount_col: amount_name}, - axis=1) - .copy() - ) - - if breakdown_col is None: - breakdown_col = "one" - df[breakdown_col] = 1 - - rec_counts = create_records(df, breakdown_col, "tax_unit_count") - rec_amounts = create_records(df, breakdown_col, amount_name) - rec_amounts["target_value"] *= multiplier # Only the amounts get * 1000 - rec_counts["target_variable"] = f"{amount_name}_tax_unit_count" - - return rec_counts, rec_amounts - - - -_TARGET_COL_MAP = { - "N1": "agi_tax_unit_count", # number of returns (≈ “tax units”) - "N2": "agi_person_count", # number of individuals - "A00100": "agi_total_amount", # total Adjusted Gross Income -} - -_BREAKDOWN_FIELD = "agi_stub" # numeric AGI stub 1‑10 from IRS -_BREAKDOWN_NAME = "agi_stub" # what will go in `breakdown_variable` - -def make_agi_long(df: pd.DataFrame) -> pd.DataFrame: - """ - Convert IRS SOI AGI‑split table from wide to the long format used - in your `records[*]` list. - - Parameters - ---------- - df : DataFrame - Must contain `ucgid_str`, `agi_stub` and the three IRS fields - in `_TARGET_COL_MAP` (N1, N2, A00100). - - Returns - ------- - DataFrame with columns: - ucgid_str - breakdown_variable (always "agi_stub") - breakdown_value (1‑10) - target_variable ("agi_tax_unit_count" | "agi_person_count" | "agi_total_amount") - target_value (float) - """ - # — keep only what we need and rename for clarity - work = ( - df[["ucgid_str", _BREAKDOWN_FIELD] + list(_TARGET_COL_MAP)] - .rename(columns=_TARGET_COL_MAP) # N1 → agi_tax_unit_count, etc. - ) - - # — wide → long - long = ( - work.melt( - id_vars=["ucgid_str", _BREAKDOWN_FIELD], - var_name="target_variable", - value_name="target_value" - ) - .rename(columns={_BREAKDOWN_FIELD: "breakdown_value"}) - .assign(breakdown_variable=_BREAKDOWN_NAME) - # Optional: add a human‑readable band label if useful - # .assign(breakdown_label=lambda d: d["breakdown_value"].map(AGI_STUB_TO_BAND)) - ) - - # — final column order - long = long[["ucgid_str", - "breakdown_variable", - "breakdown_value", - "target_variable", - "target_value"]] - - # consistently sort (purely cosmetic) - return ( - long.sort_values(["ucgid_str", "breakdown_value", "target_variable"]) - .reset_index(drop=True) - ) - - -def extract_soi_data() -> pd.DataFrame: - """Download and save congressional district AGI totals. - - In the file below, "22" is 2022, "in" is individual returns, - "cd" is congressional districts - """ - return pd.read_csv("https://www.irs.gov/pub/irs-soi/22incd.csv") - - -raw_df = extract_soi_data() -# a "stub" is a term the IRS uses for a predefined category or group, specifically an income bracket. 
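
One orientation note before the TARGETS table below: each SOI code pairs a count column (prefix "N", number of returns) with an amount column (prefix "A", dollars reported in thousands). A minimal sketch of the naming convention; the code value is taken from the TARGETS table, and the variable names here are illustrative:

# The same f-string pattern appears in transform_soi_data
# (f"N{spec['code']}" / f"A{spec['code']}"):
code = "59661"           # EITC, returns with zero qualifying children
count_col = f"N{code}"   # number of returns -> "N59661"
amount_col = f"A{code}"  # amount in $1,000s -> "A59661"
assert (count_col, amount_col) == ("N59661", "A59661")
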
- -TARGETS = [ - dict(code="59661", name="eitc", breakdown=("eitc_children", 0)), - dict(code="59662", name="eitc", breakdown=("eitc_children", 1)), - dict(code="59663", name="eitc", breakdown=("eitc_children", 2)), - dict(code="59664", name="eitc", breakdown=("eitc_children", "3+")), - dict(code="59664", name="qbid", breakdown=None), - dict(code="18500", name="real_estate_taxes", breakdown=None), - dict(code="01000", name="net_capital_gain", breakdown=None), - dict(code="03150", name="ira_payments", breakdown=None), - dict(code="00300", name="taxable_interest", breakdown=None), - dict(code="00400", name="tax_exempt_interest", breakdown=None), - dict(code="00600", name="oridinary_dividends", breakdown=None), - dict(code="00650", name="qualified_dividends", breakdown=None), - dict(code="26270", name="partnership_and_s_crop_net_income", breakdown=None), - dict(code="02500", name="total_social_security", breakdown=None), - dict(code="01700", name="pension_and_annuities", breakdown=None), - dict(code="02300", name="unemployment_compensation", breakdown=None), - dict(code="00900", name="business_net_income", breakdown=None), - dict(code="17000", name="medical_and_dental_deduction", breakdown=None), - dict(code="00700", name="salt_refunds", breakdown=None), - dict(code="18425", name="salt_amount", breakdown=None), - dict(code="06500", name="income_tax", breakdown=None), -] - - - -def transform_soi_data(raw_df) - - - # agi_stub is only 0, so there are only agi breakdowns at the state level - # So you can confirm summability for 0 and then forget that national exists - # Honestly I think that's a better idea in general. If your states don't add - # Up to your national, something's off and you should treat it as an immediate - # problem to fix rather than something to be adjusted - national_df = raw_df.copy().loc[ - (raw_df.STATE == "US") - ] - national_df["ucgid_str"] = "0100000US" - - # You've got agi_stub == 0 in here, which you want to use any time you don't want to - # break things up by AGI - state_df = raw_df.copy().loc[ - (raw_df.STATE != "US") & - (raw_df.CONG_DISTRICT == 0) - ] - state_df["ucgid_str"] = "0400000US" + state_df["STATEFIPS"].astype(str).str.zfill(2) - - # This is going to fail because we're missing the single cong district states - district_df = raw_df.copy().loc[ - (raw_df.CONG_DISTRICT > 0) - ] - - max_cong_district_by_state = raw_df.groupby('STATE')['CONG_DISTRICT'].transform('max') - district_df = raw_df.copy().loc[ - (raw_df['CONG_DISTRICT'] > 0) | (max_cong_district_by_state == 0) - ] - district_df = district_df.loc[district_df['STATE'] != 'US'] - district_df["STATEFIPS"] = district_df["STATEFIPS"].astype(int).astype(str).str.zfill(2) - district_df["CONG_DISTRICT"] = ( - district_df["CONG_DISTRICT"].astype(int).astype(str).str.zfill(2) - ) - district_df["ucgid_str"] = "5001800US" + district_df["STATEFIPS"] + district_df["CONG_DISTRICT"] - district_df = district_df[~district_df["ucgid_str"].isin(IGNORE_GEO_IDS)] - - assert district_df.shape[0] % 436 == 0 - - # And you've got everything you need for all 3 levels of targets: - # 1. national_df - # 2. state_df - # 3. district_df - - all_df = pd.concat([national_df, state_df, district_df]) - - # So I want to get 2 variable categories out of this thing, in long format - # 1) EITC, and 2) AGI - # There's eitc_child_count, eitc. There's person_count and tax_unit_count - # but no household_count. 
That's why you're doing this though, for a great example - # Wide (a new variable per number of children) or Long (breakdown variable is number of children) - - # Marginal in terms of AGI, which this data set is organized with respect to - all_marginals = all_df.copy().loc[all_df.agi_stub == 0] - assert all_marginals.shape[0] == 436 + 51 + 1 - - # Collect targets from the SOI file - records = [] - for spec in TARGETS: - count_col = f"N{spec['code']}" # e.g. 'N59661' - amount_col = f"A{spec['code']}" # e.g. 'A59661' - - df = all_marginals.copy() - - if spec["breakdown"] is not None: - col, val = spec["breakdown"] - df[col] = val - breakdown_col = col - else: - breakdown_col = None - - rec_counts, rec_amounts = make_records( - df, - count_col = count_col, - amount_col = amount_col, - amount_name = spec["name"], - breakdown_col = breakdown_col, - multiplier = 1_000, - ) - records.extend([rec_counts, rec_amounts]) - - - # Custom AGI amount, which doesn't have a count column (it has N1 and N2) - temp_df = df[["ucgid_str"]].copy() - temp_df["breakdown_variable"] = "one" - temp_df["breakdown_value"] = 1 - temp_df["target_variable"] = "agi" - temp_df["target_value"] = df["A00100"] * 1_000 - - records.append(temp_df) - - # It's notable that the national counts only have agi_stub = 0 - all_agi_splits = all_df.copy().loc[all_df.agi_stub != 0] - assert all_agi_splits.shape[0] % (436 + 51 + 0) == 0 - - # Still a bit of work to do at the time of loading, since the breakdown variable - # is agi_stub - agi_long = make_agi_long(all_agi_splits) - - # We have the distribution and the total amount, let's not go crazy here - agi_long = agi_long.loc[agi_long.target_variable != "agi_total_amount"] - - records.append(agi_long) - - return pd.concat(records) - - -def _get_soi_data(geo_level: str) -> pd.DataFrame: - """ - geo_level ∈ {'National', 'State', 'District'} - Returns a DataFrame with all SOI variables for the specified geography level - """ - if geo_level == "National": - var_indices = NATIONAL_VARIABLES - variable_pull = pull_national_soi_variable - elif geo_level == "State": - var_indices = GEOGRAPHY_VARIABLES - variable_pull = pull_state_soi_variable - elif geo_level == "District": - var_indices = GEOGRAPHY_VARIABLES - variable_pull = pull_district_soi_variable - else: - raise ValueError("geo_level must be National, State or District") - - df = pd.DataFrame() - for variable, identifiers in var_indices.items(): - count_id, amount_id = identifiers - # Pull count data (first identifier) - count_df = variable_pull( - soi_variable_ident=count_id, - variable_name=variable, - is_count=float(True), - ) - df = pd.concat([df, count_df], ignore_index=True) - # Pull amount data (second identifier) - amount_df = variable_pull( - soi_variable_ident=amount_id, - variable_name=variable, - is_count=float(False), - ) - df = pd.concat([df, amount_df], ignore_index=True) - - return df - - -def combine_geography_levels(districts: Optional[bool] = False) -> None: - """Combine SOI data across geography levels with validation and rescaling.""" - national = _get_soi_data("National") - state = _get_soi_data("State") - if districts: - district = _get_soi_data("District") - - # Add state FIPS codes for validation - state["STATEFIPS"] = state["GEO_ID"].str[-2:] - if districts: - district["STATEFIPS"] = district["GEO_ID"].str[-4:-2] - - # Get unique variables and AGI brackets for iteration - variables = national["VARIABLE"].unique() - agi_brackets = national[["LOWER_BOUND", "UPPER_BOUND"]].drop_duplicates() - - # Validate and 
rescale state totals against national totals - for variable in variables: - for is_count in [0.0, 1.0]: # Process count and amount separately - for _, bracket in agi_brackets.iterrows(): - lower, upper = ( - bracket["LOWER_BOUND"], - bracket["UPPER_BOUND"], - ) - - # Get national total for this variable/bracket/type combination - nat_mask = ( - (national["VARIABLE"] == variable) - & (national["LOWER_BOUND"] == lower) - & (national["UPPER_BOUND"] == upper) - & (national["IS_COUNT"] == is_count) - ) - us_total = national.loc[nat_mask, "VALUE"].iloc[0] - - # Get state total for this variable/bracket/type combination - state_mask = ( - (state["VARIABLE"] == variable) - & (state["LOWER_BOUND"] == lower) - & (state["UPPER_BOUND"] == upper) - & (state["IS_COUNT"] == is_count) - ) - state_total = state.loc[state_mask, "VALUE"].sum() - - # Rescale states if they don't match national total - if not np.isclose(state_total, us_total, rtol=1e-3): - count_type = "count" if is_count == 1.0 else "amount" - logger.warning( - f"States' sum does not match national total for {variable}/{count_type} " - f"in bracket [{lower}, {upper}]. Rescaling state targets." - ) - state.loc[state_mask, "VALUE"] *= us_total / state_total - - if districts: - # Validate and rescale district totals against state totals - for variable in variables: - for is_count in [0.0, 1.0]: # Process count and amount separately - for _, bracket in agi_brackets.iterrows(): - lower, upper = ( - bracket["LOWER_BOUND"], - bracket["UPPER_BOUND"], - ) - - # Create masks for this variable/bracket/type combination - state_mask = ( - (state["VARIABLE"] == variable) - & (state["LOWER_BOUND"] == lower) - & (state["UPPER_BOUND"] == upper) - & (state["IS_COUNT"] == is_count) - ) - district_mask = ( - (district["VARIABLE"] == variable) - & (district["LOWER_BOUND"] == lower) - & (district["UPPER_BOUND"] == upper) - & (district["IS_COUNT"] == is_count) - ) - - # Get state totals indexed by STATEFIPS - state_totals = state.loc[state_mask].set_index("STATEFIPS")[ - "VALUE" - ] - - # Get district totals grouped by STATEFIPS - district_totals = ( - district.loc[district_mask] - .groupby("STATEFIPS")["VALUE"] - .sum() - ) - - # Check and rescale districts for each state - for fips, d_total in district_totals.items(): - s_total = state_totals.get(fips) - - if s_total is not None and not np.isclose( - d_total, s_total, rtol=1e-3 - ): - count_type = "count" if is_count == 1.0 else "amount" - logger.warning( - f"Districts' sum does not match {fips} state total for {variable}/{count_type} " - f"in bracket [{lower}, {upper}]. Rescaling district targets." 
- ) - rescale_mask = district_mask & ( - district["STATEFIPS"] == fips - ) - district.loc[rescale_mask, "VALUE"] *= ( - s_total / d_total - ) - - # Combine all data - combined = pd.concat( - [ - national, - state.drop(columns="STATEFIPS"), - ( - district.drop(columns="STATEFIPS") - if districts - else pd.DataFrame(columns=national.columns) - ), - ], - ignore_index=True, - ).sort_values(["GEO_ID", "VARIABLE", "LOWER_BOUND"]) - - combined["DATA_SOURCE"] = "soi" - combined["BREAKDOWN_VARIABLE"] = "adjusted_gross_income" - - combined = combined[ - [ - "DATA_SOURCE", - "GEO_ID", - "GEO_NAME", - "VARIABLE", - "VALUE", - "IS_COUNT", - "BREAKDOWN_VARIABLE", - "LOWER_BOUND", - "UPPER_BOUND", - ] - ] - - # Save combined data - out_path = CALIBRATION_FOLDER / "soi.csv" - combined.to_csv(out_path, index=False) - logger.info(f"Combined SOI targets saved to {out_path}") - - -def main() -> None: - combine_geography_levels() - - -if __name__ == "__main__": - main() From 6d482e7fa50f4f5dfed9b1f9b2a514e9652505c9 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Thu, 14 Aug 2025 17:59:23 -0400 Subject: [PATCH 12/27] all major targets loaded --- Makefile | 6 +- .../db/create_initial_strata.py | 72 +++++ policyengine_us_data/db/etl_age.py | 101 ++----- policyengine_us_data/db/etl_eitc_only.py | 216 ------------- policyengine_us_data/db/etl_irs_soi.py | 284 ++++++++++++------ policyengine_us_data/db/etl_medicaid.py | 28 +- policyengine_us_data/db/etl_snap.py | 160 ++-------- policyengine_us_data/db/temp.py | 57 ++++ .../make_district_mapping.py | 254 ++++++++++++++++ policyengine_us_data/utils/census.py | 23 ++ policyengine_us_data/utils/db.py | 61 ++++ 11 files changed, 721 insertions(+), 541 deletions(-) create mode 100644 policyengine_us_data/db/create_initial_strata.py delete mode 100644 policyengine_us_data/db/etl_eitc_only.py create mode 100644 policyengine_us_data/db/temp.py create mode 100644 policyengine_us_data/storage/calibration_targets/make_district_mapping.py create mode 100644 policyengine_us_data/utils/db.py diff --git a/Makefile b/Makefile index 4124babc..01999135 100644 --- a/Makefile +++ b/Makefile @@ -61,7 +61,11 @@ documentation-dev: database: python policyengine_us_data/db/create_database_tables.py - python policyengine_us_data/db/load_age_targets.py + python policyengine_us_data/db/create_initial_strata.py + python policyengine_us_data/db/etl_age.py + python policyengine_us_data/db/etl_medicaid.py + python policyengine_us_data/db/etl_snap.py + python policyengine_us_data/db/etl_irs_soi.py clean-database: rm *.db diff --git a/policyengine_us_data/db/create_initial_strata.py b/policyengine_us_data/db/create_initial_strata.py new file mode 100644 index 00000000..0a7e7f7a --- /dev/null +++ b/policyengine_us_data/db/create_initial_strata.py @@ -0,0 +1,72 @@ +from typing import Dict + +import pandas as pd +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker +from sqlmodel import SQLModel, Session, select + + +from policyengine_us.variables.household.demographic.geographic.ucgid.ucgid_enum import UCGID +from policyengine_us_data.db.create_database_tables import ( + Stratum, + StratumConstraint, +) + + + +def main(): + # Get the implied hierarchy by the UCGID enum -------- + rows = [] + for node in UCGID: + codes = node.get_hierarchical_codes() + rows.append({ + "name": node.name, + "code": codes[0], + "parent": codes[1] if len(codes) > 1 else None + }) + + hierarchy_df = ( + pd.DataFrame(rows) + .sort_values(["parent", "code"], na_position="first") + 
.reset_index(drop=True) + ) + + + DATABASE_URL = "sqlite:///policy_data.db" + engine = create_engine(DATABASE_URL) + + Session = sessionmaker(bind=engine) + session = Session() + + # map the ucgid_str 'code' to auto-generated 'stratum_id' + code_to_stratum_id: Dict[str, int] = {} + + for _, row in hierarchy_df.iterrows(): + parent_code = row["parent"] + + parent_id = code_to_stratum_id.get(parent_code) if parent_code else None + + new_stratum = Stratum( + parent_stratum_id=parent_id, + notes=f'{row["name"]} (ucgid {row["code"]})', + stratum_group_id=1, + ) + + new_stratum.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="in", + value=row["code"], + ) + ] + + session.add(new_stratum) + + session.flush() + + code_to_stratum_id[row["code"]] = new_stratum.stratum_id + + session.commit() + +if __name__ == "__main__": + main() diff --git a/policyengine_us_data/db/etl_age.py b/policyengine_us_data/db/etl_age.py index e168317b..084a43d6 100644 --- a/policyengine_us_data/db/etl_age.py +++ b/policyengine_us_data/db/etl_age.py @@ -12,6 +12,7 @@ StratumConstraint, Target, ) +from policyengine_us_data.utils.census import get_census_docs, pull_acs_table LABEL_TO_SHORT = { @@ -32,67 +33,11 @@ "Estimate!!Total!!Total population!!AGE!!70 to 74 years": "70-74", "Estimate!!Total!!Total population!!AGE!!75 to 79 years": "75-79", "Estimate!!Total!!Total population!!AGE!!80 to 84 years": "80-84", - "Estimate!!Total!!Total population!!AGE!!85 years and over": "85-inf", + "Estimate!!Total!!Total population!!AGE!!85 years and over": "85-999", } AGE_COLS = list(LABEL_TO_SHORT.values()) -def extract_docs(year=2023): - docs_url = ( - f"https://api.census.gov/data/{year}/acs/acs1/subject/variables.json" - ) - - try: - docs_response = requests.get(docs_url) - docs_response.raise_for_status() - - docs = docs_response.json() - docs["year"] = year - - except requests.exceptions.RequestException as e: - print(f"Error during API request: {e}") - raise - except Exception as e: - print(f"An error occurred: {e}") - raise - return docs - - -def extract_age_data(geo, year=2023): - base_url = ( - f"https://api.census.gov/data/{year}/acs/acs1/subject?get=group(S0101)" - ) - - if geo == "State": - url = f"{base_url}&for=state:*" - elif geo == "District": - url = f"{base_url}&for=congressional+district:*" - elif geo == "National": - url = f"{base_url}&for=us:*" - else: - raise ValueError( - "geo must be either 'National', 'State', or 'District'" - ) - - try: - response = requests.get(url) - response.raise_for_status() - - data = response.json() - - headers = data[0] - data_rows = data[1:] - df = pd.DataFrame(data_rows, columns=headers) - - except requests.exceptions.RequestException as e: - print(f"Error during API request: {e}") - raise - except Exception as e: - print(f"An error occurred: {e}") - raise - return df - - def transform_age_data(age_data, docs): df = age_data.copy() @@ -131,13 +76,14 @@ def transform_age_data(age_data, docs): var_name="age_range", value_name="value", ) - age_bounds = df_long["age_range"].str.split("-", expand=True) - df_long["age_greater_than_or_equal_to"] = ( - age_bounds[0].str.replace("+", "").astype(int) - ) - df_long["age_less_than_or_equal_to"] = pd.to_numeric(age_bounds[1]) + age_bounds = df_long["age_range"].str.split("-", expand=True).astype(int) + age_bounds.columns = ["ge", "le"] + age_bounds[['gt']] = age_bounds[["ge"]] - 1 + age_bounds[['lt']] = age_bounds[["le"]] + 1 + + df_long["age_greater_than"] = age_bounds[["gt"]] + df_long["age_less_than"] = 
age_bounds[["lt"]] df_long["variable"] = "person_count" - df_long["period"] = docs["year"] df_long["reform_id"] = 0 df_long["source_id"] = 1 df_long["active"] = True @@ -149,7 +95,7 @@ def get_parent_geo(geo): return {"National": None, "State": "National", "District": "State"}[geo] -def load_age_data(df_long, geo, stratum_lookup={}): +def load_age_data(df_long, geo, year, stratum_lookup={}): # Quick data quality check before loading ---- if geo == "National": @@ -192,6 +138,7 @@ def load_age_data(df_long, geo, stratum_lookup={}): ) # Create constraints and link them to the parent's relationship attribute. + # TODO: greater_than_or_equal_to to just greater than! new_stratum.constraints_rel = [ StratumConstraint( constraint_variable="ucgid_str", @@ -200,18 +147,18 @@ def load_age_data(df_long, geo, stratum_lookup={}): ), StratumConstraint( constraint_variable="age", - operation="greater_than_or_equal", - value=str(row["age_greater_than_or_equal_to"]), + operation="greater_than", + value=str(row["age_greater_than"]), ), ] - age_lt_value = row["age_less_than_or_equal_to"] + age_lt_value = row["age_less_than"] if not np.isinf(age_lt_value): new_stratum.constraints_rel.append( StratumConstraint( constraint_variable="age", operation="less_than", - value=str(age_lt_value + 1), + value=str(row["age_less_than"]), ) ) @@ -219,7 +166,7 @@ def load_age_data(df_long, geo, stratum_lookup={}): new_stratum.targets_rel.append( Target( variable=row["variable"], - period=row["period"], + period=year, value=row["value"], source_id=row["source_id"], active=row["active"], @@ -243,18 +190,24 @@ def load_age_data(df_long, geo, stratum_lookup={}): if __name__ == "__main__": # --- ETL: Extract, Transform, Load ---- + year = 2023 # ---- Extract ---------- - docs = extract_docs(2023) - national_df = extract_age_data("National", 2023) - state_df = extract_age_data("State", 2023) + docs = get_census_docs(year) + national_df = pull_acs_table("S0101", "National", year) + state_df = pull_acs_table("S0101", "State", year) + district_df = pull_acs_table("S0101", "District", year) # --- Transform ---------- long_national_df = transform_age_data(national_df, docs) long_state_df = transform_age_data(state_df, docs) + long_district_df = transform_age_data(district_df, docs) # --- Load -------- - national_strata_lku = load_age_data(long_national_df, "National") + national_strata_lku = load_age_data(long_national_df, "National", year) state_strata_lku = load_age_data( - long_state_df, "State", national_strata_lku + long_state_df, "State", year, national_strata_lku + ) + load_age_data( + long_district_df, "District", year, state_strata_lku ) diff --git a/policyengine_us_data/db/etl_eitc_only.py b/policyengine_us_data/db/etl_eitc_only.py deleted file mode 100644 index 20d52cef..00000000 --- a/policyengine_us_data/db/etl_eitc_only.py +++ /dev/null @@ -1,216 +0,0 @@ -import logging -import requests -from pathlib import Path -import io - -import pandas as pd -import numpy as np -from sqlalchemy import create_engine -from sqlalchemy.orm import sessionmaker - -from policyengine_us_data.db.create_database_tables import ( - Stratum, - StratumConstraint, - Target, -) - - -logger = logging.getLogger(__name__) - - -def extract_eitc_data(): - # IRS Table 2.5, Tax Year 2020S - url = "https://www.irs.gov/pub/irs-soi/20in25ic.xls" - r = requests.get(url, timeout=30) - r.raise_for_status() - - # Pandas uses xlrd to open .xls - xls = pd.ExcelFile(io.BytesIO(r.content), engine="xlrd") - sheets = {name: xls.parse(name, header=None) for name in 
xls.sheet_names} - - raw = sheets[xls.sheet_names[0]] - return raw - - -def transform_eitc_data(raw_data): - # This is not ideal from a data processing standpoint, but it's too much - # effort to fully parse this hierarchical XLS for a few data points - # At least the full lineage is represented from the source - - zero_children_returns = raw_data.iloc[8, 25] - zero_children_amount = raw_data.iloc[8, 26] * 1000 - - one_child_returns = raw_data.iloc[8, 39] - one_child_amount = raw_data.iloc[8, 40] * 1000 - - two_children_returns = raw_data.iloc[8, 57] - two_children_amount = raw_data.iloc[8, 58] * 1000 - - three_plus_children_returns = raw_data.iloc[8, 73] - three_plus_children_amount = raw_data.iloc[8, 74] * 1000 - - assert zero_children_returns == 7636714 - assert zero_children_amount == 2255068000 - - df_long = pd.DataFrame( - [ - [ - "0100000US", - "children_equal_to", - 0, - "tax_unit_count", - zero_children_returns, - ], - [ - "0100000US", - "children_equal_to", - 1, - "tax_unit_count", - one_child_returns, - ], - [ - "0100000US", - "children_equal_to", - 2, - "tax_unit_count", - two_children_returns, - ], - [ - "0100000US", - "children_greater_or_equal_to", - 3, - "tax_unit_count", - three_plus_children_returns, - ], - [ - "0100000US", - "children_equal_to", - 0, - "eitc", - zero_children_amount, - ], - ["0100000US", "children_equal_to", 1, "eitc", one_child_returns], - [ - "0100000US", - "children_equal_to", - 2, - "eitc", - two_children_returns, - ], - [ - "0100000US", - "children_greater_or_equal_to", - 3, - "eitc", - three_plus_children_returns, - ], - ] - ) - - df_long.columns = [ - "ucgid", - "constraint", - "constraint_value", - "variable", - "value", - ] - - df_long["period"] = 2020 - df_long["reform_id"] = 0 - df_long["source_id"] = 2 - df_long["active"] = True - - return df_long - - -def load_eitc_data(df_long): - - DATABASE_URL = "sqlite:///policy_data.db" - engine = create_engine(DATABASE_URL) - - Session = sessionmaker(bind=engine) - session = Session() - - ucgid = df_long.iloc[0]["ucgid"] - for num_children in [0, 1, 2, 3]: - note = f"eitc_child_count: {num_children}, Geo: {ucgid}" - new_stratum = Stratum( - parent_stratum_id=None, stratum_group_id=0, notes=note - ) - - new_stratum.constraints_rel = [ - StratumConstraint( - constraint_variable="ucgid", - operation="equals", - value=ucgid, - ), - ] - - if num_children <= 2: - new_stratum.constraints_rel.append( - StratumConstraint( - constraint_variable="eitc_child_count", - operation="equals", - value=str(num_children), - ), - ) - elif num_children > 2: - new_stratum.constraints_rel.append( - StratumConstraint( - constraint_variable="eitc_child_count", - operation="greater_or_equal_than", - value=str(3), - ), - ) - - rows = df_long.loc[df_long["constraint_value"] == num_children] - count_target = rows.loc[rows.variable == "tax_unit_count"][ - "value" - ].values[0] - amount_target = rows.loc[rows.variable == "eitc"]["value"].values[0] - - # Avoiding magic numbers in the load step - count_active = rows.loc[rows.variable == "tax_unit_count"][ - "active" - ].values[0] - amount_active = rows.loc[rows.variable == "eitc"]["active"].values[0] - - period = rows.iloc[0]["period"] - source_id = rows.iloc[0]["source_id"] - - new_stratum.targets_rel = [ - Target( - variable="eitc", - period=period, - value=amount_target, - source_id=source_id, - active=amount_active, - ), - Target( - variable="tax_unit_count", - period=period, - value=amount_target, - source_id=source_id, - active=count_active, - ), - ] - - session.add(new_stratum) 
- session.flush() - print(new_stratum.stratum_id) - - session.commit() - - -if __name__ == "__main__": - - # --- ETL: Extract, Transform, Load ---- - - # ---- Extract ---------- - national_df = extract_eitc_data() - - # --- Transform ---------- - long_national_df = transform_eitc_data(national_df) - - # --- Load -------- - state_strata_lku = load_eitc_data(long_national_df) diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py index d9eeb503..c93eb593 100644 --- a/policyengine_us_data/db/etl_irs_soi.py +++ b/policyengine_us_data/db/etl_irs_soi.py @@ -1,65 +1,36 @@ -from pathlib import Path -from typing import List, Optional, Sequence, Dict, Tuple, Any, Union +from typing import Optional import numpy as np import pandas as pd -from sqlalchemy import create_engine -from sqlalchemy.orm import sessionmaker +from sqlmodel import Session, create_engine from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, Target, ) - - - - -"""Utilities to pull AGI targets from the IRS SOI data files.""" - -# Congressional districts have one fewer level than the national and state -# They're missing the million plus category -# ("No AGI Stub") is a specific, intentional category used by the IRS in its summary data files. -# -#SOI_COLUMNS = [ -# "Under $1", -# "$1 under $10,000", -# "$10,000 under $25,000", -# "$25,000 under $50,000", -# "$50,000 under $75,000", -# "$75,000 under $100,000", -# "$100,000 under $200,000", -# "$200,000 under $500,000", -# "$500,000 or more", -#] -# -#AGI_STUB_TO_BAND = {i + 1: band for i, band in enumerate(SOI_COLUMNS)} -# -#AGI_BOUNDS = { -# "Under $1": (-np.inf, 1), -# "$1 under $10,000": (1, 10_000), -# "$10,000 under $25,000": (10_000, 25_000), -# "$25,000 under $50,000": (25_000, 50_000), -# "$50,000 under $75,000": (50_000, 75_000), -# "$75,000 under $100,000": (75_000, 100_000), -# "$100,000 under $200,000": (100_000, 200_000), -# "$200,000 under $500,000": (200_000, 500_000), -# "$500,000 or more": (500_000, np.inf), -#} -# -##NON_VOTING_STATES = {"US", "AS", "GU", "MP", "PR", "VI", "OA"} -# -IGNORE_GEO_IDS = { - "0400000US72", # Puerto Rico (state level) - "5001800US7298", # Puerto Rico - "5001800US6098", # American Samoa - "5001800US6698", # Guam - "5001800US6998", # Northern Mariana Islands - "5001800US7898", # U.S. 
Virgin Islands
+from policyengine_us_data.utils.db import get_stratum_by_id, get_simple_stratum_by_ucgid, get_root_strata, get_stratum_children, get_stratum_parent
+from policyengine_us_data.utils.census import TERRITORY_UCGIDS
+from policyengine_us_data.storage.calibration_targets.make_district_mapping import get_district_mapping
+
+
+"""See the 22incddocguide.docx manual from the IRS SOI"""
+# Let's make this work with strict inequalities
+# Interpret Language: '$10,000 under $25,000'
+epsilon = 0.005  # Half a penny
+AGI_STUB_TO_INCOME_RANGE = {
+    1: (-np.inf, 1),
+    2: (1 - epsilon, 10_000),
+    3: (10_000 - epsilon, 25_000),
+    4: (25_000 - epsilon, 50_000),
+    5: (50_000 - epsilon, 75_000),
+    6: (75_000 - epsilon, 100_000),
+    7: (100_000 - epsilon, 200_000),
+    8: (200_000 - epsilon, 500_000),
+    9: (500_000 - epsilon, np.inf),
 }
 
-
 def create_records(df, breakdown_variable, target_variable):
     """Transforms a DataFrame subset into a standardized list of records."""
     temp_df = df[["ucgid_str"]].copy()
@@ -123,10 +94,39 @@ def make_agi_long(df: pd.DataFrame) -> pd.DataFrame:
         "breakdown_value",
         "target_variable",
         "target_value"]]
-    return (
-        long.sort_values(["ucgid_str", "breakdown_value", "target_variable"])
-        .reset_index(drop=True)
-    )
+
+    return [
+        df.sort_values(by='ucgid_str').reset_index(drop=True)
+        for name, df in long.groupby(['breakdown_value', 'target_variable'])
+    ]
+
+
+def convert_district_data(
+    input_df: pd.DataFrame,
+    mapping_matrix: np.ndarray,  # 436 x 436
+    new_district_codes
+) -> pd.DataFrame:
+    """Transforms data from pre- to post- 2020 census districts"""
+    df = input_df.copy()
+    old_districts_df = df[df['ucgid_str'].str.startswith("5001800US")].copy()
+    old_districts_df = old_districts_df.sort_values('ucgid_str').reset_index(drop=True)
+    old_values = old_districts_df['target_value'].to_numpy()
+    new_values = mapping_matrix.T @ old_values
+
+    # Create a new DataFrame for the transformed data, preserving the original schema. 
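+    # A toy check of the matrix step above, with illustrative numbers: if two
+    # old districts hold values [10, 20] and the population-share rows are
+    # M = [[1.0, 0.0], [0.25, 0.75]], then the new totals are
+    # M.T @ [10, 20] = [10*1.0 + 20*0.25, 10*0.0 + 20*0.75] = [15.0, 15.0],
+    # so value moves between districts while the overall sum is preserved.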
+    new_districts_df = pd.DataFrame({
+        'ucgid_str': new_district_codes,
+        'breakdown_variable': old_districts_df['breakdown_variable'],
+        'breakdown_value': old_districts_df['breakdown_value'],
+        'target_variable': old_districts_df['target_variable'],
+        'target_value': new_values
+    })
+
+    other_geos_df = df[~df['ucgid_str'].str.startswith("5001800US")].copy()
+
+    final_df = pd.concat([other_geos_df, new_districts_df], ignore_index=True)
+
+    return final_df
 
 
 def extract_soi_data() -> pd.DataFrame:
@@ -195,7 +195,7 @@ def transform_soi_data(raw_df):
         district_df["CONG_DISTRICT"].astype(int).astype(str).str.zfill(2)
     )
     district_df["ucgid_str"] = "5001800US" + district_df["STATEFIPS"] + district_df["CONG_DISTRICT"]
-    district_df = district_df[~district_df["ucgid_str"].isin(IGNORE_GEO_IDS)]
+    district_df = district_df[~district_df["ucgid_str"].isin(TERRITORY_UCGIDS)]
 
     assert district_df.shape[0] % 436 == 0
 
@@ -244,12 +244,15 @@ def transform_soi_data(raw_df):
     all_agi_splits = all_df.copy().loc[all_df.agi_stub != 0]
     assert all_agi_splits.shape[0] % (436 + 51 + 0) == 0
 
-    agi_long = make_agi_long(all_agi_splits)
-    agi_long = agi_long.loc[agi_long.target_variable != "agi_total_amount"]
+    agi_long_records = make_agi_long(all_agi_splits)
 
-    records.append(agi_long)
+    records.extend(agi_long_records)
 
-    return pd.concat(records)
+    # Pre- to Post- 2020 Census redistricting
+    mapping = get_district_mapping()
+    converted = [convert_district_data(r, mapping['mapping_matrix'], mapping['new_codes']) for r in records]
+
+    return converted
 
 
 def load_soi_data(long_dfs, year):
@@ -257,11 +260,10 @@ def load_soi_data(long_dfs, year):
     DATABASE_URL = "sqlite:///policy_data.db"
     engine = create_engine(DATABASE_URL)
 
-    Session = sessionmaker(bind=engine)
-    session = Session()
+    session = Session(engine)
 
     # Load EITC data --------------------------------------------------------
-    # NOTE: obviously this is not especially robust ---
+    # Obviously this is not especially robust ---
     eitc_data = {'0': (long_dfs[0], long_dfs[1]),
                  '1': (long_dfs[2], long_dfs[3]),
                  '2': (long_dfs[4], long_dfs[5]),
@@ -302,8 +304,8 @@ def load_soi_data(long_dfs, year):
             new_stratum.constraints_rel.append(
                 StratumConstraint(
                     constraint_variable="eitc_children",
-                    operation="greater_than_or_equal_to",
-                    value='3',
+                    operation="greater_than",
+                    value='2',
                 )
             )
         else:
@@ -316,13 +318,14 @@ def load_soi_data(long_dfs, year):
             )
 
         new_stratum.targets_rel = [
-            Target(
-                variable="tax_unit_count",
-                period=year,
-                value=eitc_count_i.iloc[i][["target_value"]].values[0],
-                source_id=5,
-                active=True,
-            ),
+            # It's already complex enough
+            #Target(
+            #    variable="tax_unit_count",
+            #    period=year,
+            #    value=eitc_count_i.iloc[i][["target_value"]].values[0],
+            #    source_id=5,
+            #    active=True,
+            #),
             Target(
                 variable="eitc",
                 period=year,
@@ -340,26 +343,104 @@ def load_soi_data(long_dfs, year):
         elif len(ucgid_i) == 11:
             stratum_lookup["State"][ucgid_i] = new_stratum.stratum_id
 
+    session.commit()
 
     # No breakdown variables in this set
     for j in range(8, 42, 2):
-        print(long_dfs[j])  # count
-        print(long_dfs[j + 1])  # amount
-
-        # Why are we making strata here? You have a lot of these to run through
         count_j, amount_j = long_dfs[j], long_dfs[j + 1]
+        amount_variable_name = amount_j.iloc[0][["target_variable"]].values[0]
+        print(f"Loading amount data for IRS SOI data on {amount_variable_name}")
         for i in range(count_j.shape[0]):
             ucgid_i = count_j[['ucgid_str']].iloc[i].values[0]
-            # If there's no breakdown variable, is this a new geo? 
-            # The problem is, it's vary difficult to search for a geography
-            # That's already in existance
-            note = f"Geo: {ucgid_i}"
-
-            if len(ucgid_i) == 9:  # National.
-                new_stratum = Stratum(
-                    parent_stratum_id=None, stratum_group_id=0, notes=note
+            # Reusing an existing stratum this time, since there is no breakdown
+            stratum = get_simple_stratum_by_ucgid(session, ucgid_i)
+            amount_value = amount_j.iloc[i][["target_value"]].values[0]
+
+            stratum.targets_rel.append(
+                # NOTE: If I do the counts, I'm going to need to explode the strata for the vars != 0
+                # OR, create new variables like qbid_tax_unit_count which requires adding stuff to -us
+                # AND, it's already complex enough -----
+                #Target(
+                #    variable="tax_unit_count",
+                #    period=year,
+                #    value=count_j.iloc[i][["target_value"]].values[0],
+                #    source_id=5,
+                #    active=True,
+                #),
+                Target(
+                    variable=amount_variable_name,
+                    period=year,
+                    value=amount_value,
+                    source_id=5,
+                    active=True,
                 )
-            elif len(ucgid_i) == 11:  # State
+            )
+
+            session.add(stratum)
+            session.flush()
+
+    session.commit()
+
+    # Adjusted Gross Income ------
+    agi_values = long_dfs[42]
+
+    for i in range(agi_values.shape[0]):
+        ucgid_i = agi_values[['ucgid_str']].iloc[i].values[0]
+        stratum = get_simple_stratum_by_ucgid(session, ucgid_i)
+        stratum.targets_rel.append(
+            Target(
+                variable="agi",
+                period=year,
+                value=agi_values.iloc[i][["target_value"]].values[0],
+                source_id=5,
+                active=True,
+            )
+        )
+        session.add(stratum)
+        session.flush()
+
+    session.commit()
+
+    agi_person_count_dfs = [df for df in long_dfs[43:] if df['target_variable'].iloc[0] == 'agi_person_count']
+
+    for agi_df in agi_person_count_dfs:
+        agi_stub = agi_df.iloc[0][["breakdown_value"]].values[0]
+        agi_income_lower, agi_income_upper = AGI_STUB_TO_INCOME_RANGE[agi_stub]
+
+        # Make a National Stratum for each AGI Stub, even though there's no national target
+        # There is no national target because the data set only has agi_stub = 0 for national
+        note = f"Geo: 0100000US, AGI > {agi_income_lower}, AGI < {agi_income_upper}"
+        nat_stratum = Stratum(
+            parent_stratum_id=None, stratum_group_id=0, notes=note
+        )
+        nat_stratum.constraints_rel.extend([
+            StratumConstraint(
+                constraint_variable="ucgid_str",
+                operation="in",
+                value="0100000US",
+            ),
+            StratumConstraint(
+                constraint_variable="agi",
+                operation="greater_than",
+                value=str(agi_income_lower),
+            ),
+            StratumConstraint(
+                constraint_variable="agi",
+                operation="less_than",
+                value=str(agi_income_upper),
+            ),
+        ])
+        session.add(nat_stratum)
+        session.flush()
+
+        stratum_lookup = {"National": nat_stratum.stratum_id, "State": {}, "District": {}}
+        for i in range(agi_df.shape[0]):
+            ucgid_i = agi_df[['ucgid_str']].iloc[i].values[0]
+            note = f"Geo: {ucgid_i}, AGI > {agi_income_lower}, AGI < {agi_income_upper}"
+
+            person_count = agi_df.iloc[i][["target_value"]].values[0]
+
+            if len(ucgid_i) == 11:  # State
                 new_stratum = Stratum(
                     parent_stratum_id=stratum_lookup["National"],
                     stratum_group_id=0,
@@ -371,26 +452,28 @@ def load_soi_data(long_dfs, year):
                     stratum_group_id=0,
                     notes=note
                 )
-
-            new_stratum.constraints_rel = [
+            new_stratum.constraints_rel.extend([
                 StratumConstraint(
                     constraint_variable="ucgid_str",
                     operation="in",
                     value=ucgid_i,
                 ),
-            ]
+                StratumConstraint(
+                    constraint_variable="agi",
+                    operation="greater_than",
+                    value=str(agi_income_lower),
+                ),
+                StratumConstraint(
+                    constraint_variable="agi",
+                    operation="less_than",
+                    value=str(agi_income_upper),
+                ),
+            ])
             new_stratum.targets_rel = [
                 Target(
-                    variable="tax_unit_count",
+                    variable="person_count",
                     period=year,
-                    value=count_j.iloc[i][["target_value"]].values[0],
-                    source_id=5,
-                    
active=True, - ), - Target( - variable=amount_j.iloc[0][["target_variable"]].values[0], - period=year, - value=amount_j.iloc[i][["target_value"]].values[0], + value=person_count, source_id=5, active=True, ) @@ -407,12 +490,17 @@ def load_soi_data(long_dfs, year): session.commit() - -def main() -> None: +def main(): year = 2022 # NOTE: predates the finalization of the 2020 Census redistricting + + # Extract ----------------------- raw_df = extract_soi_data() - long_dfs = transform_soi_data(raw_df): + # Transform --------------------- + long_dfs = transform_soi_data(raw_df) + + # Load --------------------- + load_soi_data(long_dfs, year) if __name__ == "__main__": diff --git a/policyengine_us_data/db/etl_medicaid.py b/policyengine_us_data/db/etl_medicaid.py index d1babe31..ec16ac71 100644 --- a/policyengine_us_data/db/etl_medicaid.py +++ b/policyengine_us_data/db/etl_medicaid.py @@ -9,22 +9,7 @@ StratumConstraint, Target, ) - - -# State abbreviation to FIPS code mapping -state_fips_map = { - 'AL': '01', 'AK': '02', 'AZ': '04', 'AR': '05', 'CA': '06', - 'CO': '08', 'CT': '09', 'DE': '10', 'FL': '12', 'GA': '13', - 'HI': '15', 'ID': '16', 'IL': '17', 'IN': '18', 'IA': '19', - 'KS': '20', 'KY': '21', 'LA': '22', 'ME': '23', 'MD': '24', - 'MA': '25', 'MI': '26', 'MN': '27', 'MS': '28', 'MO': '29', - 'MT': '30', 'NE': '31', 'NV': '32', 'NH': '33', 'NJ': '34', - 'NM': '35', 'NY': '36', 'NC': '37', 'ND': '38', 'OH': '39', - 'OK': '40', 'OR': '41', 'PA': '42', 'RI': '44', 'SC': '45', - 'SD': '46', 'TN': '47', 'TX': '48', 'UT': '49', 'VT': '50', - 'VA': '51', 'WA': '53', 'WV': '54', 'WI': '55', 'WY': '56', - 'DC': '11' -} +from policyengine_us_data.utils.census import STATE_ABBREV_TO_FIPS def extract_medicaid_data(year): @@ -63,7 +48,7 @@ def transform_medicaid_data(state_admin_df, cd_survey_df, year): ["State Abbreviation", "Reporting Period", "Total Medicaid Enrollment"] ] - state_df["FIPS"] = state_df["State Abbreviation"].map(state_fips_map) + state_df["FIPS"] = state_df["State Abbreviation"].map(STATE_ABBREV_TO_FIPS) cd_df = cd_survey_df[["GEO_ID", "state", "congressional district", "S2704_C02_006E"]] @@ -100,7 +85,7 @@ def load_medicaid_data(long_state, long_cd, year): nat_stratum.constraints_rel = [ StratumConstraint( constraint_variable="ucgid_str", - operation="equals", + operation="in", value="0100000US", ), StratumConstraint( @@ -128,7 +113,7 @@ def load_medicaid_data(long_state, long_cd, year): new_stratum.constraints_rel = [ StratumConstraint( constraint_variable="ucgid_str", - operation="equals", + operation="in", value=row["ucgid_str"], ), StratumConstraint( @@ -162,7 +147,7 @@ def load_medicaid_data(long_state, long_cd, year): new_stratum.constraints_rel = [ StratumConstraint( constraint_variable="ucgid_str", - operation="equals", + operation="in", value=row["ucgid_str"], ), StratumConstraint( @@ -190,8 +175,11 @@ def load_medicaid_data(long_state, long_cd, year): year = 2023 + # Extract ------------------------------ cd_survey_df, state_admin_df = extract_medicaid_data(year) + # Transform ------------------- long_state, long_cd = transform_medicaid_data(state_admin_df, cd_survey_df, year) + # Load ----------------------- load_medicaid_data(long_state, long_cd, year) diff --git a/policyengine_us_data/db/etl_snap.py b/policyengine_us_data/db/etl_snap.py index a0f20133..f9a172a9 100644 --- a/policyengine_us_data/db/etl_snap.py +++ b/policyengine_us_data/db/etl_snap.py @@ -17,68 +17,11 @@ Target, ) from policyengine_us_data.utils.census import ( - get_census_docs, pull_acs_table, 
STATE_NAME_TO_FIPS, ) -STATE_NAME_TO_FIPS = { - "Alabama": "01", - "Alaska": "02", - "Arizona": "04", - "Arkansas": "05", - "California": "06", - "Colorado": "08", - "Connecticut": "09", - "Delaware": "10", - "District of Columbia": "11", - "Florida": "12", - "Georgia": "13", - "Hawaii": "15", - "Idaho": "16", - "Illinois": "17", - "Indiana": "18", - "Iowa": "19", - "Kansas": "20", - "Kentucky": "21", - "Louisiana": "22", - "Maine": "23", - "Maryland": "24", - "Massachusetts": "25", - "Michigan": "26", - "Minnesota": "27", - "Mississippi": "28", - "Missouri": "29", - "Montana": "30", - "Nebraska": "31", - "Nevada": "32", - "New Hampshire": "33", - "New Jersey": "34", - "New Mexico": "35", - "New York": "36", - "North Carolina": "37", - "North Dakota": "38", - "Ohio": "39", - "Oklahoma": "40", - "Oregon": "41", - "Pennsylvania": "42", - "Rhode Island": "44", - "South Carolina": "45", - "South Dakota": "46", - "Tennessee": "47", - "Texas": "48", - "Utah": "49", - "Vermont": "50", - "Virginia": "51", - "Washington": "53", - "West Virginia": "54", - "Wisconsin": "55", - "Wyoming": "56", -} - -# Administrative data ------------------------------------------------ - def extract_administrative_snap_data(year=2023): """ Downloads and extracts annual state-level SNAP data from the USDA FNS zip file. @@ -122,6 +65,10 @@ def extract_administrative_snap_data(year=2023): return zipfile.ZipFile(io.BytesIO(response.content)) +def extract_survey_snap_data(year): + return pull_acs_table("S2201", "District", year) + + def transform_administrative_snap_data(zip_file, year): filename = f"FY{str(year)[-2:]}.xlsx" with zip_file.open(filename) as f: @@ -185,6 +132,22 @@ def transform_administrative_snap_data(zip_file, year): return df_states +def transform_survey_snap_data(raw_df): + df = raw_df.copy() + return df[["GEO_ID", "S2201_C03_001E"]].rename({ + "GEO_ID": "ucgid_str", + "S2201_C03_001E": "snap_household_ct" + }, axis=1 + )[ + ~df["GEO_ID"].isin( + [ # Puerto Rico's state and district + "0400000US72", + "5001800US7298", + ] + ) + ] + + def load_administrative_snap_data(df_states, year): DATABASE_URL = "sqlite:///policy_data.db" @@ -211,7 +174,8 @@ def load_administrative_snap_data(df_states, year): value="0", ), ] - # No target at the national level is provided at this time. + # No target at the national level is provided at this time. 
Keeping it
+    # so that the state strata can have a parent stratum
     session.add(nat_stratum)
     session.flush()
@@ -266,40 +230,7 @@ def load_administrative_snap_data(df_states, year):
     return stratum_lookup
 
 
-# Survey data ------------------------------------------------------
-
-def extract_survey_snap_data(year):
-
-    raw_dfs = {}
-    for geo in ["District", "State", "National"]:
-        df = pull_acs_table("S2201", geo, year)
-        raw_dfs[geo] = df
-
-    return raw_dfs
-
-
-def transform_survey_snap_data(raw_dfs):
-
-    dfs = {}
-    for geo in raw_dfs.keys():
-        df = raw_dfs[geo]
-        dfs[geo] = df_data = df[["GEO_ID", "S2201_C03_001E"]].rename({
-            "GEO_ID": "ucgid_str",
-            "S2201_C03_001E": "snap_household_ct"
-            }, axis=1
-        )[
-            ~df["GEO_ID"].isin(
-                [  # Puerto Rico's state and district
-                    "0400000US72",
-                    "5001800US7298",
-                ]
-            )
-        ].copy()
-
-    return dfs
-
-
-def load_survey_snap_data(survey_dfs, year, stratum_lookup ={}):
+def load_survey_snap_data(survey_df, year, stratum_lookup={}):
     """Use an already defined stratum_lookup to load the survey SNAP data"""
 
     DATABASE_URL = "sqlite:///policy_data.db"
@@ -308,43 +239,8 @@ def load_survey_snap_data(survey_dfs, year, stratum_lookup ={}):
     Session = sessionmaker(bind=engine)
     session = Session()
 
-    # National. Use the stratum from the administrative data function
-    nat_df = survey_dfs["National"]
-    nat_stratum = session.get(Stratum, stratum_lookup["National"])
-
-    nat_stratum.targets_rel.append(
-        Target(
-            variable="household_count",
-            period=year,
-            value=nat_df["snap_household_ct"],
-            source_id=4,
-            active=True,
-        )
-    )
-    session.add(nat_stratum)
-    session.flush()
-
-    # Skipping state for now, but
-    # # State. Also use the stratum from the administrative data function
-    # state_df = survey_dfs["State"]
-    # for _, row in state_df.iterrows():
-    #     print(row)
-    #     state_stratum = session.get(Stratum, stratum_lookup["State"][row["ucgid_str"]])

-    #     state_stratum.targets_rel.append(
-    #         Target(
-    #             variable="household_count",
-    #             period=year,
-    #             value=row["snap_household_ct"],
-    #             source_id=4,
-    #             active=True,
-    #         )
-    #     )
-    #     session.add(state_stratum)
-    #     session.flush()
-
-    # You will need to create new strata for districts
-    district_df = survey_dfs["District"]
+    # Create new strata for districts whose households receive SNAP benefits
+    district_df = survey_df.copy()
     for _, row in district_df.iterrows():
         note = f"Geo: {row['ucgid_str']} Received SNAP Benefits"
         state_ucgid_str = '0400000US' + row['ucgid_str'][9:11]
@@ -387,15 +283,15 @@ def main():
 
     # Extract ---------
     zip_file_admin = extract_administrative_snap_data()
-    raw_survey_dfs = extract_survey_snap_data(year)
+    raw_survey_df = extract_survey_snap_data(year)
 
     # Transform -------
     state_admin_df = transform_administrative_snap_data(zip_file_admin, year)
-    survey_dfs = transform_survey_snap_data(raw_survey_dfs)
+    district_survey_df = transform_survey_snap_data(raw_survey_df)
 
     # Load -----------
     stratum_lookup = load_administrative_snap_data(state_admin_df, year)
-    load_survey_snap_data(survey_dfs, year, stratum_lookup)
+    load_survey_snap_data(district_survey_df, year, stratum_lookup)
 
 
 if __name__ == "__main__":
diff --git a/policyengine_us_data/db/temp.py b/policyengine_us_data/db/temp.py
new file mode 100644
index 00000000..6026ace0
--- /dev/null
+++ b/policyengine_us_data/db/temp.py
@@ -0,0 +1,57 @@
+# ucgid_str converts the household’s ucgid enumeration into a comma‑separated string of all hierarchical UCGID codes. 
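+#
+# For example, splitting one such string recovers the district, its state, and
+# the nation (the value below matches the demo output later in this file):
+codes = "5001800US0623,0400000US06,0100000US".split(",")
+assert codes == ["5001800US0623", "0400000US06", "0100000US"]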
+from policyengine_us import Simulation +from policyengine_us.variables.household.demographic.geographic.ucgid.ucgid_enum import UCGID + +# Minimal one-household simulation +sim = Simulation( + situation={ + "people": {"p1": {}}, + "households": {"h1": {"members": ["p1"]}}, + } +) + +# Assign a specific UCGID (California district 23) +sim.set_input("ucgid", 2024, UCGID.CA_23) + +# Use the ucgid_str Variable's formula +ucgid_str_val = sim.calculate("ucgid_str", 2024) +print(ucgid_str_val) +# ['5001800US0623,0400000US06,0100000US'] + + +# First, let's explore UCGID, the enum, and how it can create the hierarchy + +import pandas as pd +from policyengine_us.variables.household.demographic.geographic.ucgid.ucgid_enum import UCGID + +rows = [] +for node in UCGID: + codes = node.get_hierarchical_codes() + rows.append({ + "name": node.name, + "code": codes[0], + "parent": codes[1] if len(codes) > 1 else None + }) + +hierarchy_df = ( + pd.DataFrame(rows) + .sort_values(["parent", "code"], na_position="first") + .reset_index(drop=True) +) + +print(hierarchy_df) +#Out[262]: +# name code parent +#0 US 0100000US None +#1 AL 0400000US01 0100000US +#2 AK 0400000US02 0100000US +#3 AZ 0400000US04 0100000US +#4 AR 0400000US05 0100000US +#.. ... ... ... +#483 WI_05 5001800US5505 0400000US55 +#484 WI_06 5001800US5506 0400000US55 +#485 WI_07 5001800US5507 0400000US55 +#486 WI_08 5001800US5508 0400000US55 +#487 WY_01 5001800US5600 0400000US56 +# +#[488 rows x 3 columns] diff --git a/policyengine_us_data/storage/calibration_targets/make_district_mapping.py b/policyengine_us_data/storage/calibration_targets/make_district_mapping.py new file mode 100644 index 00000000..cc3f50fb --- /dev/null +++ b/policyengine_us_data/storage/calibration_targets/make_district_mapping.py @@ -0,0 +1,254 @@ +""" +This was built before finding out about the crosswalk provided by the +Missouri Census Data Center (MCDC) at the University of Missouri. This crosswalk can be +accessed at (https://mcdc.missouri.edu/applications/geocorr.html) and would be a logical place +to transition to, though since this is already built and new IRS SOI files may be available soon, +it may not be worth the effort to transition. + +To see the definitive "before and after" of congressional redistricting following the 2020 census, +you should compare the block-level data from the 116th Congress to the 119th Congress. + +This approach is necessary for states whose initial redistricting maps were altered due to legal +challenges and is aligned with the mapping files provided by the U.S. Census Bureau. + +- **116th Congress (The "Before"):** This session (2019-2021) used the congressional maps +based on the 2010 census data. It serves as the stable pre-redistricting baseline, as these +maps were identical to those used by the 117th Congress. The Census Bureau's most recent files +for that decade correspond to the 116th Congress. + +- **118th Congress (The "Interim" Stage):** In several states, the initial congressional maps drawn +for the 2022 elections were successfully challenged and invalidated by courts (e.g., for reasons of +partisan or racial gerrymandering). This required the use of temporary, court-ordered, or remedial +maps for the 2022 elections. Consequently, the 118th Congress (2023-2025) in these states represents +an interim stage, not the final outcome of the redistricting cycle. 
+ +- **119th Congress (The Definitive "After"):** Following these legal resolutions, new and more permanent +congressional maps were enacted ahead of the 2024 election cycle. The elections in November 2024 were +the first to use these new maps. Therefore, the 119th Congress (2025-2027) is the first to reflect the +final, settled mapping decisions based on the 2020 census data. + +By comparing the 116th and 119th Congresses, you bypass the anomalous, non-final maps of the 118th Congress, +providing a clear analysis of the redistricting cycle's ultimate impact. +""" + +import requests +import zipfile +import io +from pathlib import Path + +import pandas as pd +import numpy as np +import us + +from policyengine_us_data.storage import STORAGE_FOLDER, CALIBRATION_FOLDER + + +def fetch_block_to_district_map(congress: int) -> pd.DataFrame: + """ + Fetches the Census Block Equivalency File (BEF) for a given Congress. + + This file maps every 2020 census block (GEOID) to its corresponding + congressional district. + + Args: + congress: The congressional session number (e.g., 118 or 119). + + Returns: + A DataFrame with columns ['GEOID', f'CD{congress}']. + """ + if congress == 116: + url = "https://www2.census.gov/programs-surveys/decennial/rdo/mapping-files/2019/116-congressional-district-bef/cd116.zip" + zbytes = requests.get(url, timeout=120).content + + with zipfile.ZipFile(io.BytesIO(zbytes)) as z: + fname = "National_CD116.txt" + bef = pd.read_csv(z.open(fname), dtype=str) + bef.columns = bef.columns.str.strip() + bef = bef.rename(columns={"BLOCKID": "GEOID"}) + return bef[["GEOID", f"CD{congress}"]] + + elif congress == 118: + url = "https://www2.census.gov/programs-surveys/decennial/rdo/mapping-files/2023/118-congressional-district-bef/cd118.zip" + zbytes = requests.get(url, timeout=120).content + + with zipfile.ZipFile(io.BytesIO(zbytes)) as z: + fname = "National_CD118.txt" + bef = pd.read_csv(z.open(fname), dtype=str) + bef.columns = bef.columns.str.strip() + district_col = [c for c in bef.columns if c != "GEOID"][0] + bef = bef.rename(columns={district_col: f"CD{congress}"}) + return bef[["GEOID", f"CD{congress}"]] + + elif congress == 119: + url = "https://www2.census.gov/programs-surveys/decennial/rdo/mapping-files/2025/119-congressional-district-befs/cd119.zip" + zbytes = requests.get(url, timeout=120).content + + with zipfile.ZipFile(io.BytesIO(zbytes)) as z: + fname = "NationalCD119.txt" + bef = pd.read_csv(z.open(fname), sep=",", dtype=str) + bef.columns = bef.columns.str.strip() + bef = bef.rename(columns={"CDFP": f"CD{congress}"}) + return bef[["GEOID", f"CD{congress}"]] + + else: + raise ValueError( + f"Congress {congress} is not supported by this function." + ) + + +def fetch_block_population(state) -> pd.DataFrame: + """ + Download & parse the 2020 PL-94-171 “legacy” files for one state. + + Parameters + ---------- + state : str + Two-letter state/territory postal code **or** full state name + (e.g., "GA", "Georgia", "PR", "Puerto Rico"). + + Returns + ------- + pandas.DataFrame with columns GEOID (15-digit block code) and POP20. 
+ """ + BASE = ( + "https://www2.census.gov/programs-surveys/decennial/2020/data/" + "01-Redistricting_File--PL_94-171/{dir}/{abbr}2020.pl.zip" + ) + st = us.states.lookup(state) + if st is None: + raise ValueError(f"Unrecognised state name/abbr: {state}") + + # Build URL components ----------------------------------------------------- + dir_name = st.name.replace(" ", "_") + abbr = st.abbr.lower() + url = BASE.format(dir=dir_name, abbr=abbr) + + # Download and open the zip ------------------------------------------------ + zbytes = requests.get(url, timeout=120).content + with zipfile.ZipFile(io.BytesIO(zbytes)) as z: + raw = z.read(f"{abbr}geo2020.pl") + try: + geo_lines = raw.decode("utf-8").splitlines() + except UnicodeDecodeError: + geo_lines = raw.decode("latin-1").splitlines() + + p1_lines = z.read(f"{abbr}000012020.pl").decode("utf-8").splitlines() + + # ---------------- GEO file: keep blocks (SUMLEV 750) ---------------------- + geo_records = [ + (parts[7], parts[8][-15:]) # LOGRECNO, 15-digit block GEOID + for ln in geo_lines + if (parts := ln.split("|"))[2] == "750" # summary level 750 = blocks + ] + geo_df = pd.DataFrame(geo_records, columns=["LOGRECNO", "GEOID"]) + + # ---------------- P-file: pull total-population cell ---------------------- + p1_records = [ + (p[4], int(p[5])) for p in map(lambda x: x.split("|"), p1_lines) + ] + p1_df = pd.DataFrame(p1_records, columns=["LOGRECNO", "P0010001"]) + + # ---------------- Merge & finish ----------------------------------------- + return ( + geo_df.merge(p1_df, on="LOGRECNO", how="left") + .assign(POP20=lambda d: d["P0010001"].fillna(0).astype(int)) + .loc[:, ["GEOID", "POP20"]] + .sort_values("GEOID") + .reset_index(drop=True) + ) + + +def build_crosswalk_cd116_to_cd119(): + """Builds the crosswalk between 116th and 119th congress""" + # Pull the census block level population data one state at a time + state_pops = [] + for s in us.states.STATES_AND_TERRITORIES: + if not s.is_territory and s.abbr not in ["DC", "ZZ"]: + print(s.name) + state_pops.append(fetch_block_population(s.abbr)) + block_pop_df = pd.concat(state_pops) + + # Get census blocks for each district under the 116th and 119th congress + # Remove 'ZZ': blocks not assigned to any congressional district + df116 = fetch_block_to_district_map(116) + df116 = df116.loc[df116["CD116"] != "ZZ"] + df119 = fetch_block_to_district_map(119) + df119 = df119.loc[df119["CD119"] != "ZZ"] + + common_blocks = df116.merge(df119, on="GEOID") + + block_stats = block_pop_df.merge(common_blocks, on="GEOID") + block_stats["state_fips"] = block_stats.GEOID.str[:2] + shares = ( + block_stats.groupby(["state_fips", "CD116", "CD119"])["POP20"] + .sum() + .rename("pop_shared") + .reset_index() + ) + + def make_cd_code(state, district): + return f"5001800US{str(state).zfill(2)}{str(district).zfill(2)}" + + shares["code_old"] = shares.apply( + lambda row: make_cd_code(row.state_fips, row.CD116), axis=1 + ) + shares["code_new"] = shares.apply( + lambda row: make_cd_code(row.state_fips, row.CD119), axis=1 + ) + shares["proportion"] = shares.groupby("code_old").pop_shared.transform( + lambda s: s / s.sum() + ) + + ## add DC's district + dc_row = pd.DataFrame( + { + "state_fips": ["11"], # DC's FIPS + "CD116": ["98"], # at-large code in the BEF files + "CD119": ["98"], + "pop_shared": [689545], + "code_old": ["5001800US1198"], + "code_new": ["5001800US1198"], + "proportion": [1.0], + } + ) + + shares = pd.concat([shares, dc_row], ignore_index=True) + + district_mapping = ( + shares[["code_old", 
"code_new", "proportion"]] + .sort_values(["code_old", "proportion"], ascending=[True, False]) + .reset_index(drop=True) + ) + assert len(set(district_mapping.code_old)) == 436 + assert len(set(district_mapping.code_new)) == 436 + mapping_path = Path(STORAGE_FOLDER, "district_mapping.csv") + district_mapping.to_csv(mapping_path, index=False) + + +def get_district_mapping(): + """Puts the 436 by 436 - with DC - (old by new) district mapping matrix into memory""" + + mapping_path = Path(STORAGE_FOLDER, "district_mapping.csv") + mapping_df = pd.read_csv(mapping_path) + + old_codes = sorted(mapping_df.code_old.unique()) + new_codes = sorted(mapping_df.code_new.unique()) + assert len(old_codes) == len(new_codes) == 436 + + old_index = {c: i for i, c in enumerate(old_codes)} + new_index = {c: j for j, c in enumerate(new_codes)} + + mapping_matrix = np.zeros((436, 436), dtype=float) + + for row in mapping_df.itertuples(index=False): + i = old_index[row.code_old] + j = new_index[row.code_new] + mapping_matrix[i, j] = row.proportion + + assert np.allclose(mapping_matrix.sum(axis=1), 1.0) + return {'mapping_matrix': mapping_matrix, 'old_codes': old_codes, 'new_codes': new_codes} + + +if __name__ == "__main__": + build_crosswalk_cd116_to_cd119() + print(get_district_mapping_matrix()) diff --git a/policyengine_us_data/utils/census.py b/policyengine_us_data/utils/census.py index 69a475fb..018cb6a7 100644 --- a/policyengine_us_data/utils/census.py +++ b/policyengine_us_data/utils/census.py @@ -59,6 +59,29 @@ "Wyoming": "56", } +STATE_ABBREV_TO_FIPS = { + 'AL': '01', 'AK': '02', 'AZ': '04', 'AR': '05', 'CA': '06', + 'CO': '08', 'CT': '09', 'DE': '10', 'FL': '12', 'GA': '13', + 'HI': '15', 'ID': '16', 'IL': '17', 'IN': '18', 'IA': '19', + 'KS': '20', 'KY': '21', 'LA': '22', 'ME': '23', 'MD': '24', + 'MA': '25', 'MI': '26', 'MN': '27', 'MS': '28', 'MO': '29', + 'MT': '30', 'NE': '31', 'NV': '32', 'NH': '33', 'NJ': '34', + 'NM': '35', 'NY': '36', 'NC': '37', 'ND': '38', 'OH': '39', + 'OK': '40', 'OR': '41', 'PA': '42', 'RI': '44', 'SC': '45', + 'SD': '46', 'TN': '47', 'TX': '48', 'UT': '49', 'VT': '50', + 'VA': '51', 'WA': '53', 'WV': '54', 'WI': '55', 'WY': '56', + 'DC': '11' +} + +TERRITORY_UCGIDS = { + "0400000US72", # Puerto Rico (state level) + "5001800US7298", # Puerto Rico + "5001800US6098", # American Samoa + "5001800US6698", # Guam + "5001800US6998", # Northern Mariana Islands + "5001800US7898", # U.S. Virgin Islands +} + def get_census_docs(year): docs_url = ( diff --git a/policyengine_us_data/utils/db.py b/policyengine_us_data/utils/db.py new file mode 100644 index 00000000..bb484fab --- /dev/null +++ b/policyengine_us_data/utils/db.py @@ -0,0 +1,61 @@ +from typing import List, Optional + +from sqlmodel import Session, select +import sqlalchemy as sa + +from policyengine_us_data.db.create_database_tables import Stratum, StratumConstraint + + +def get_stratum_by_id(session: Session, stratum_id: int) -> Optional[Stratum]: + """Retrieves a single Stratum by its primary key""" + return session.get(Stratum, stratum_id) + + +def get_simple_stratum_by_ucgid(session: Session, ucgid: str) -> Optional[Stratum]: + """ + Finds a stratum defined *only* by a single ucgid_str constraint. 
+ """ + constraint_count_subquery = ( + select( + StratumConstraint.stratum_id, + sa.func.count(StratumConstraint.stratum_id).label("constraint_count") + ) + .group_by(StratumConstraint.stratum_id) + .subquery() + ) + + statement = ( + select(Stratum) + .join(StratumConstraint) + .join( + constraint_count_subquery, + Stratum.stratum_id == constraint_count_subquery.c.stratum_id + ) + .where(StratumConstraint.constraint_variable == "ucgid_str") + .where(StratumConstraint.value == ucgid) + .where(constraint_count_subquery.c.constraint_count == 1) + ) + + return session.exec(statement).first() + + +def get_root_strata(session: Session) -> List[Stratum]: + """Finds all strata that do not have a parent""" + statement = select(Stratum).where(Stratum.parent_stratum_id == None) + return session.exec(statement).all() + + +def get_stratum_children(session: Session, stratum_id: int) -> List[Stratum]: + """Retrieves all direct children for a given stratum""" + parent_stratum = get_stratum_by_id(session, stratum_id) + if parent_stratum: + return parent_stratum.children_rel + return [] + + +def get_stratum_parent(session: Session, stratum_id: int) -> Optional[Stratum]: + """Retrieves the direct parent for a given stratum.""" + child_stratum = get_stratum_by_id(session, stratum_id) + if child_stratum: + return child_stratum.parent_rel + return None From dddf6891c61ee4fb4ccb84ef0e9f1d49636ebb3f Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Thu, 14 Aug 2025 18:57:31 -0400 Subject: [PATCH 13/27] linting --- .../db/create_initial_strata.py | 41 +- policyengine_us_data/db/etl_age.py | 10 +- policyengine_us_data/db/etl_irs_soi.py | 362 ++++++++++-------- policyengine_us_data/db/etl_medicaid.py | 61 ++- policyengine_us_data/db/etl_snap.py | 22 +- policyengine_us_data/db/temp.py | 50 +-- .../make_district_mapping.py | 6 +- policyengine_us_data/utils/census.py | 68 +++- policyengine_us_data/utils/db.py | 15 +- 9 files changed, 381 insertions(+), 254 deletions(-) diff --git a/policyengine_us_data/db/create_initial_strata.py b/policyengine_us_data/db/create_initial_strata.py index 0a7e7f7a..a2a333df 100644 --- a/policyengine_us_data/db/create_initial_strata.py +++ b/policyengine_us_data/db/create_initial_strata.py @@ -6,31 +6,33 @@ from sqlmodel import SQLModel, Session, select -from policyengine_us.variables.household.demographic.geographic.ucgid.ucgid_enum import UCGID +from policyengine_us.variables.household.demographic.geographic.ucgid.ucgid_enum import ( + UCGID, +) from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, ) - def main(): - # Get the implied hierarchy by the UCGID enum -------- + # Get the implied hierarchy by the UCGID enum -------- rows = [] for node in UCGID: codes = node.get_hierarchical_codes() - rows.append({ - "name": node.name, - "code": codes[0], - "parent": codes[1] if len(codes) > 1 else None - }) - + rows.append( + { + "name": node.name, + "code": codes[0], + "parent": codes[1] if len(codes) > 1 else None, + } + ) + hierarchy_df = ( pd.DataFrame(rows) - .sort_values(["parent", "code"], na_position="first") - .reset_index(drop=True) + .sort_values(["parent", "code"], na_position="first") + .reset_index(drop=True) ) - DATABASE_URL = "sqlite:///policy_data.db" engine = create_engine(DATABASE_URL) @@ -40,11 +42,13 @@ def main(): # map the ucgid_str 'code' to auto-generated 'stratum_id' code_to_stratum_id: Dict[str, int] = {} - + for _, row in hierarchy_df.iterrows(): parent_code = row["parent"] - - parent_id = 
code_to_stratum_id.get(parent_code) if parent_code else None + + parent_id = ( + code_to_stratum_id.get(parent_code) if parent_code else None + ) new_stratum = Stratum( parent_stratum_id=parent_id, @@ -59,14 +63,15 @@ def main(): value=row["code"], ) ] - + session.add(new_stratum) - + session.flush() - + code_to_stratum_id[row["code"]] = new_stratum.stratum_id session.commit() + if __name__ == "__main__": main() diff --git a/policyengine_us_data/db/etl_age.py b/policyengine_us_data/db/etl_age.py index 084a43d6..7bb36ed4 100644 --- a/policyengine_us_data/db/etl_age.py +++ b/policyengine_us_data/db/etl_age.py @@ -78,11 +78,11 @@ def transform_age_data(age_data, docs): ) age_bounds = df_long["age_range"].str.split("-", expand=True).astype(int) age_bounds.columns = ["ge", "le"] - age_bounds[['gt']] = age_bounds[["ge"]] - 1 - age_bounds[['lt']] = age_bounds[["le"]] + 1 + age_bounds[["gt"]] = age_bounds[["ge"]] - 1 + age_bounds[["lt"]] = age_bounds[["le"]] + 1 df_long["age_greater_than"] = age_bounds[["gt"]] - df_long["age_less_than"] = age_bounds[["lt"]] + df_long["age_less_than"] = age_bounds[["lt"]] df_long["variable"] = "person_count" df_long["reform_id"] = 0 df_long["source_id"] = 1 @@ -208,6 +208,4 @@ def load_age_data(df_long, geo, year, stratum_lookup={}): state_strata_lku = load_age_data( long_state_df, "State", year, national_strata_lku ) - load_age_data( - long_district_df, "District", year, state_strata_lku - ) + load_age_data(long_district_df, "District", year, state_strata_lku) diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py index c93eb593..a4a07cfe 100644 --- a/policyengine_us_data/db/etl_irs_soi.py +++ b/policyengine_us_data/db/etl_irs_soi.py @@ -10,9 +10,17 @@ StratumConstraint, Target, ) -from policyengine_us_data.utils.db import get_stratum_by_id, get_simple_stratum_by_ucgid, get_root_strata, get_stratum_children, get_stratum_parent +from policyengine_us_data.utils.db import ( + get_stratum_by_id, + get_simple_stratum_by_ucgid, + get_root_strata, + get_stratum_children, + get_stratum_parent, +) from policyengine_us_data.utils.census import TERRITORY_UCGIDS -from policyengine_us_data.storage.calibration_targets.make_district_mapping import get_district_mapping +from policyengine_us_data.storage.calibration_targets.make_district_mapping import ( + get_district_mapping, +) """See the 22incddocguide.docx manual from the IRS SOI""" @@ -22,7 +30,7 @@ AGI_STUB_TO_INCOME_RANGE = { 1: (-np.inf, 1), 2: (1 - epsilon, 10_000), - 3: (10_000 - epsilon , 25_000), + 3: (10_000 - epsilon, 25_000), 4: (25_000 - epsilon, 50_000), 5: (50_000 - epsilon, 75_000), 6: (75_000 - epsilon, 100_000), @@ -31,12 +39,13 @@ 9: (500_000 - epsilon, np.inf), } + def create_records(df, breakdown_variable, target_variable): """Transforms a DataFrame subset into a standardized list of records.""" temp_df = df[["ucgid_str"]].copy() - temp_df["breakdown_variable"] = breakdown_variable + temp_df["breakdown_variable"] = breakdown_variable temp_df["breakdown_value"] = df[breakdown_variable] - temp_df["target_variable"] = target_variable + temp_df["target_variable"] = target_variable temp_df["target_value"] = df[target_variable] return temp_df @@ -50,18 +59,15 @@ def make_records( breakdown_col: Optional[str] = None, multiplier: int = 1_000, ): - df = ( - df.rename({count_col: "tax_unit_count", - amount_col: amount_name}, - axis=1) - .copy() - ) + df = df.rename( + {count_col: "tax_unit_count", amount_col: amount_name}, axis=1 + ).copy() if breakdown_col is None: 
breakdown_col = "one" df[breakdown_col] = 1 - rec_counts = create_records(df, breakdown_col, "tax_unit_count") + rec_counts = create_records(df, breakdown_col, "tax_unit_count") rec_amounts = create_records(df, breakdown_col, amount_name) rec_amounts["target_value"] *= multiplier # Only the amounts get * 1000 rec_counts["target_variable"] = f"{amount_name}_tax_unit_count" @@ -72,57 +78,64 @@ def make_records( def make_agi_long(df: pd.DataFrame) -> pd.DataFrame: """Convert IRS SOI AGI‑split table from wide to the long format used""" target_col_map = { - "N1": "agi_tax_unit_count", - "N2": "agi_person_count", + "N1": "agi_tax_unit_count", + "N2": "agi_person_count", "A00100": "agi_total_amount", } - work = ( - df[["ucgid_str", "agi_stub"] + list(target_col_map)] - .rename(columns=target_col_map) + work = df[["ucgid_str", "agi_stub"] + list(target_col_map)].rename( + columns=target_col_map ) long = ( work.melt( id_vars=["ucgid_str", "agi_stub"], var_name="target_variable", - value_name="target_value" + value_name="target_value", ) .rename(columns={"agi_stub": "breakdown_value"}) .assign(breakdown_variable="agi_stub") ) - long = long[["ucgid_str", - "breakdown_variable", - "breakdown_value", - "target_variable", - "target_value"]] + long = long[ + [ + "ucgid_str", + "breakdown_variable", + "breakdown_value", + "target_variable", + "target_value", + ] + ] return [ - df.sort_values(by='ucgid_str').reset_index(drop=True) - for name, df in long.groupby(['breakdown_value', 'target_variable']) + df.sort_values(by="ucgid_str").reset_index(drop=True) + for name, df in long.groupby(["breakdown_value", "target_variable"]) ] def convert_district_data( input_df: pd.DataFrame, mapping_matrix: np.ndarray, # 436 x 436A - new_district_codes + new_district_codes, ) -> pd.DataFrame: """Transforms data from pre- to post- 2020 census districts""" df = input_df.copy() - old_districts_df = df[df['ucgid_str'].str.startswith("5001800US")].copy() - old_districts_df = old_districts_df.sort_values('ucgid_str').reset_index(drop=True) - old_values = old_districts_df['target_value'].to_numpy() + old_districts_df = df[df["ucgid_str"].str.startswith("5001800US")].copy() + old_districts_df = old_districts_df.sort_values("ucgid_str").reset_index( + drop=True + ) + old_values = old_districts_df["target_value"].to_numpy() new_values = mapping_matrix.T @ old_values # Create a new DataFrame for the transformed data, preserving the original schema. 
- new_districts_df = pd.DataFrame({ - 'ucgid_str': new_district_codes, - 'breakdown_variable': old_districts_df['breakdown_variable'], - 'breakdown_value': old_districts_df['breakdown_value'], - 'target_variable': old_districts_df['target_variable'], - 'target_value': new_values - }) + new_districts_df = pd.DataFrame( + { + "ucgid_str": new_district_codes, + "breakdown_variable": old_districts_df["breakdown_variable"], + "breakdown_value": old_districts_df["breakdown_value"], + "target_variable": old_districts_df["target_variable"], + "target_value": new_values, + } + ) - other_geos_df = df[~df['ucgid_str'].str.startswith("5001800US")].copy() + other_geos_df = df[~df["ucgid_str"].str.startswith("5001800US")].copy() final_df = pd.concat([other_geos_df, new_districts_df], ignore_index=True) @@ -153,88 +166,96 @@ def transform_soi_data(raw_df): dict(code="00400", name="tax_exempt_interest", breakdown=None), dict(code="00600", name="oridinary_dividends", breakdown=None), dict(code="00650", name="qualified_dividends", breakdown=None), - dict(code="26270", name="partnership_and_s_crop_net_income", breakdown=None), + dict( + code="26270", + name="partnership_and_s_crop_net_income", + breakdown=None, + ), dict(code="02500", name="total_social_security", breakdown=None), dict(code="01700", name="pension_and_annuities", breakdown=None), dict(code="02300", name="unemployment_compensation", breakdown=None), dict(code="00900", name="business_net_income", breakdown=None), - dict(code="17000", name="medical_and_dental_deduction", breakdown=None), + dict( + code="17000", name="medical_and_dental_deduction", breakdown=None + ), dict(code="00700", name="salt_refunds", breakdown=None), dict(code="18425", name="salt_amount", breakdown=None), dict(code="06500", name="income_tax", breakdown=None), ] # National --------------- - national_df = raw_df.copy().loc[ - (raw_df.STATE == "US") - ] + national_df = raw_df.copy().loc[(raw_df.STATE == "US")] national_df["ucgid_str"] = "0100000US" # State ------------------- # You've got agi_stub == 0 in here, which you want to use any time you don't want to # break things up by AGI state_df = raw_df.copy().loc[ - (raw_df.STATE != "US") & - (raw_df.CONG_DISTRICT == 0) + (raw_df.STATE != "US") & (raw_df.CONG_DISTRICT == 0) ] - state_df["ucgid_str"] = "0400000US" + state_df["STATEFIPS"].astype(str).str.zfill(2) + state_df["ucgid_str"] = "0400000US" + state_df["STATEFIPS"].astype( + str + ).str.zfill(2) # District ------------------ # This is going to fail because we're missing the single cong district states - district_df = raw_df.copy().loc[ - (raw_df.CONG_DISTRICT > 0) - ] + district_df = raw_df.copy().loc[(raw_df.CONG_DISTRICT > 0)] - max_cong_district_by_state = raw_df.groupby('STATE')['CONG_DISTRICT'].transform('max') + max_cong_district_by_state = raw_df.groupby("STATE")[ + "CONG_DISTRICT" + ].transform("max") district_df = raw_df.copy().loc[ - (raw_df['CONG_DISTRICT'] > 0) | (max_cong_district_by_state == 0) + (raw_df["CONG_DISTRICT"] > 0) | (max_cong_district_by_state == 0) ] - district_df = district_df.loc[district_df['STATE'] != 'US'] - district_df["STATEFIPS"] = district_df["STATEFIPS"].astype(int).astype(str).str.zfill(2) + district_df = district_df.loc[district_df["STATE"] != "US"] + district_df["STATEFIPS"] = ( + district_df["STATEFIPS"].astype(int).astype(str).str.zfill(2) + ) district_df["CONG_DISTRICT"] = ( district_df["CONG_DISTRICT"].astype(int).astype(str).str.zfill(2) ) - district_df["ucgid_str"] = "5001800US" + district_df["STATEFIPS"] + 
district_df["CONG_DISTRICT"] + district_df["ucgid_str"] = ( + "5001800US" + district_df["STATEFIPS"] + district_df["CONG_DISTRICT"] + ) district_df = district_df[~district_df["ucgid_str"].isin(TERRITORY_UCGIDS)] assert district_df.shape[0] % 436 == 0 all_df = pd.concat([national_df, state_df, district_df]) - # "Marginal" over AGI bands, which this data set is organized according to + # "Marginal" over AGI bands, which this data set is organized according to all_marginals = all_df.copy().loc[all_df.agi_stub == 0] assert all_marginals.shape[0] == 436 + 51 + 1 # Collect targets from the SOI file records = [] for spec in TARGETS: - count_col = f"N{spec['code']}" # e.g. 'N59661' + count_col = f"N{spec['code']}" # e.g. 'N59661' amount_col = f"A{spec['code']}" # e.g. 'A59661' - + df = all_marginals.copy() - + if spec["breakdown"] is not None: col, val = spec["breakdown"] df[col] = val breakdown_col = col else: breakdown_col = None - + rec_counts, rec_amounts = make_records( df, - count_col = count_col, - amount_col = amount_col, - amount_name = spec["name"], - breakdown_col = breakdown_col, - multiplier = 1_000, + count_col=count_col, + amount_col=amount_col, + amount_name=spec["name"], + breakdown_col=breakdown_col, + multiplier=1_000, ) records.extend([rec_counts, rec_amounts]) - # AGI Processing (separate, doesn't have a count column) temp_df = df[["ucgid_str"]].copy() - temp_df["breakdown_variable"] = "one" - temp_df["breakdown_value"] = 1 + temp_df["breakdown_variable"] = "one" + temp_df["breakdown_value"] = 1 temp_df["target_variable"] = "agi" temp_df["target_value"] = df["A00100"] * 1_000 @@ -250,9 +271,14 @@ def transform_soi_data(raw_df): # Pre- to Post- 2020 Census redisticting mapping = get_district_mapping() - converted = [convert_district_data(r, mapping['mapping_matrix'], mapping['new_codes']) for r in records] + converted = [ + convert_district_data( + r, mapping["mapping_matrix"], mapping["new_codes"] + ) + for r in records + ] - return converted + return converted def load_soi_data(long_dfs, year): @@ -262,70 +288,74 @@ def load_soi_data(long_dfs, year): session = Session(engine) - # Load EITC data -------------------------------------------------------- + # Load EITC data -------------------------------------------------------- # Obviously this is not especially robust --- - eitc_data = {'0': (long_dfs[0], long_dfs[1]), - '1': (long_dfs[2], long_dfs[3]), - '2': (long_dfs[4], long_dfs[5]), - '3+': (long_dfs[6], long_dfs[7])} + eitc_data = { + "0": (long_dfs[0], long_dfs[1]), + "1": (long_dfs[2], long_dfs[3]), + "2": (long_dfs[4], long_dfs[5]), + "3+": (long_dfs[6], long_dfs[7]), + } stratum_lookup = {"State": {}, "District": {}} for n_children in eitc_data.keys(): eitc_count_i, eitc_amount_i = eitc_data[n_children] for i in range(eitc_count_i.shape[0]): - ucgid_i = eitc_count_i[['ucgid_str']].iloc[i].values[0] + ucgid_i = eitc_count_i[["ucgid_str"]].iloc[i].values[0] note = f"Geo: {ucgid_i}, EITC received with {n_children} children" if len(ucgid_i) == 9: # National. 
new_stratum = Stratum( parent_stratum_id=None, stratum_group_id=0, notes=note ) - elif len(ucgid_i) == 11: # State + elif len(ucgid_i) == 11: # State new_stratum = Stratum( parent_stratum_id=stratum_lookup["National"], stratum_group_id=0, - notes=note + notes=note, ) - elif len(ucgid_i) == 13: # District + elif len(ucgid_i) == 13: # District new_stratum = Stratum( - parent_stratum_id=stratum_lookup["State"]['0400000US' + ucgid_i[9:11]], + parent_stratum_id=stratum_lookup["State"][ + "0400000US" + ucgid_i[9:11] + ], stratum_group_id=0, - notes=note + notes=note, ) new_stratum.constraints_rel = [ - StratumConstraint( - constraint_variable="ucgid_str", - operation="in", - value=ucgid_i, - ), + StratumConstraint( + constraint_variable="ucgid_str", + operation="in", + value=ucgid_i, + ), ] if n_children == "3+": - new_stratum.constraints_rel.append( - StratumConstraint( - constraint_variable="eitc_children", - operation="greater_than", - value='2', - ) - ) + new_stratum.constraints_rel.append( + StratumConstraint( + constraint_variable="eitc_children", + operation="greater_than", + value="2", + ) + ) else: - new_stratum.constraints_rel.append( - StratumConstraint( - constraint_variable="eitc_children", - operation="equals", - value=f'{n_children}', - ) - ) + new_stratum.constraints_rel.append( + StratumConstraint( + constraint_variable="eitc_children", + operation="equals", + value=f"{n_children}", + ) + ) new_stratum.targets_rel = [ # It's already complex enough - #Target( + # Target( # variable="tax_unit_count", # period=year, # value=eitc_count_i.iloc[i][["target_value"]].values[0], # source_id=5, # active=True, - #), + # ), Target( variable="eitc", period=year, @@ -339,19 +369,21 @@ def load_soi_data(long_dfs, year): session.flush() if len(ucgid_i) == 9: - stratum_lookup["National"] = new_stratum.stratum_id - elif len(ucgid_i) == 11: - stratum_lookup["State"][ucgid_i] = new_stratum.stratum_id + stratum_lookup["National"] = new_stratum.stratum_id + elif len(ucgid_i) == 11: + stratum_lookup["State"][ucgid_i] = new_stratum.stratum_id session.commit() - # No breakdown variables in this set + # No breakdown variables in this set for j in range(8, 42, 2): - count_j, amount_j = long_dfs[j], long_dfs[j + 1] + count_j, amount_j = long_dfs[j], long_dfs[j + 1] amount_variable_name = amount_j.iloc[0][["target_variable"]].values[0] - print(f"Loading amount data for IRS SOI data on {amount_variable_name}") + print( + f"Loading amount data for IRS SOI data on {amount_variable_name}" + ) for i in range(count_j.shape[0]): - ucgid_i = count_j[['ucgid_str']].iloc[i].values[0] + ucgid_i = count_j[["ucgid_str"]].iloc[i].values[0] # Reusing an existing stratum this time, since there is no breakdown stratum = get_simple_stratum_by_ucgid(session, ucgid_i) @@ -361,13 +393,13 @@ def load_soi_data(long_dfs, year): # NOTE: If I do the counts, I'm going to need to explode the strata for the vars != 0 # OR, create new variables like qbid_tax_unit_count which requires adding stuff to -us # AND, it's already complex enough ----- - #Target( + # Target( # variable="tax_unit_count", # period=year, # value=count_j.iloc[i][["target_value"]].values[0], # source_id=5, # active=True, - #), + # ), Target( variable=amount_variable_name, period=year, @@ -382,11 +414,11 @@ def load_soi_data(long_dfs, year): session.commit() - # Adjusted Gross Income ------ + # Adjusted Gross Income ------ agi_values = long_dfs[42] for i in range(agi_values.shape[0]): - ucgid_i = agi_values[['ucgid_str']].iloc[i].values[0] + ucgid_i = 
agi_values[["ucgid_str"]].iloc[i].values[0] stratum = get_simple_stratum_by_ucgid(session, ucgid_i) stratum.targets_rel.append( Target( @@ -399,10 +431,14 @@ def load_soi_data(long_dfs, year): ) session.add(stratum) session.flush() - + session.commit() - agi_person_count_dfs = [df for df in long_dfs[43:] if df['target_variable'].iloc[0] == 'agi_person_count'] + agi_person_count_dfs = [ + df + for df in long_dfs[43:] + if df["target_variable"].iloc[0] == "agi_person_count" + ] for agi_df in agi_person_count_dfs: agi_stub = agi_df.iloc[0][["breakdown_value"]].values[0] @@ -413,62 +449,72 @@ def load_soi_data(long_dfs, year): nat_stratum = Stratum( parent_stratum_id=None, stratum_group_id=0, notes=note ) - nat_stratum.constraints_rel.extend([ - StratumConstraint( - constraint_variable="ucgid_str", - operation="in", - value=ucgid_i, - ), - StratumConstraint( - constraint_variable="agi", - operation="greater_than", - value=str(agi_income_lower), - ), - StratumConstraint( - constraint_variable="agi", - operation="less_than", - value=str(agi_income_upper), - ), - ]) + nat_stratum.constraints_rel.extend( + [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="in", + value=ucgid_i, + ), + StratumConstraint( + constraint_variable="agi", + operation="greater_than", + value=str(agi_income_lower), + ), + StratumConstraint( + constraint_variable="agi", + operation="less_than", + value=str(agi_income_upper), + ), + ] + ) session.add(nat_stratum) session.flush() - - stratum_lookup = {"National": nat_stratum.stratum_id, "State": {}, "District": {}} + + stratum_lookup = { + "National": nat_stratum.stratum_id, + "State": {}, + "District": {}, + } for i in range(agi_df.shape[0]): - ucgid_i = agi_df[['ucgid_str']].iloc[i].values[0] + ucgid_i = agi_df[["ucgid_str"]].iloc[i].values[0] note = f"Geo: {ucgid_i}, AGI > {agi_income_lower}, AGI < {agi_income_upper}" person_count = agi_df.iloc[i][["target_value"]].values[0] - if len(ucgid_i) == 11: # State + if len(ucgid_i) == 11: # State new_stratum = Stratum( parent_stratum_id=stratum_lookup["National"], stratum_group_id=0, - notes=note + notes=note, ) - elif len(ucgid_i) == 13: # District + elif len(ucgid_i) == 13: # District new_stratum = Stratum( - parent_stratum_id=stratum_lookup["State"]['0400000US' + ucgid_i[9:11]], + parent_stratum_id=stratum_lookup["State"][ + "0400000US" + ucgid_i[9:11] + ], stratum_group_id=0, - notes=note + notes=note, ) - new_stratum.constraints_rel.extend([ - StratumConstraint( - constraint_variable="ucgid_str", - operation="in", - value=ucgid_i, - ), - StratumConstraint( - constraint_variable="agi", - operation="greater_than", - value=str(agi_income_lower), - ), - StratumConstraint( - constraint_variable="agi", - operation="less_than", - value=str(agi_income_upper), - ), - ]) + new_stratum.constraints_rel.extend( + [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="in", + value=ucgid_i, + ), + StratumConstraint( + constraint_variable="agi", + operation="greater_than", + value=str(agi_income_lower), + ), + StratumConstraint( + constraint_variable="agi", + operation="less_than", + value=str(agi_income_upper), + ), + ] + ) new_stratum.targets_rel = [ Target( variable="person_count", @@ -483,9 +529,9 @@ def load_soi_data(long_dfs, year): session.flush() if len(ucgid_i) == 9: - stratum_lookup["National"] = new_stratum.stratum_id - elif len(ucgid_i) == 11: - stratum_lookup["State"][ucgid_i] = new_stratum.stratum_id + stratum_lookup["National"] = new_stratum.stratum_id + elif len(ucgid_i) == 11: + 
stratum_lookup["State"][ucgid_i] = new_stratum.stratum_id session.commit() diff --git a/policyengine_us_data/db/etl_medicaid.py b/policyengine_us_data/db/etl_medicaid.py index ec16ac71..3a5ab7d7 100644 --- a/policyengine_us_data/db/etl_medicaid.py +++ b/policyengine_us_data/db/etl_medicaid.py @@ -28,13 +28,13 @@ def extract_medicaid_data(year): item = "6165f45b-ca93-5bb5-9d06-db29c692a360" response = requests.get( - f"https://data.medicaid.gov/api/1/metastore/schemas/dataset/items/{item}?show-reference-ids=false" + f"https://data.medicaid.gov/api/1/metastore/schemas/dataset/items/{item}?show-reference-ids=false" ) metadata = response.json() - - data_url = metadata['distribution'][0]['data']['downloadURL'] + + data_url = metadata["distribution"][0]["data"]["downloadURL"] state_admin_df = pd.read_csv(data_url) - + return cd_survey_df, state_admin_df @@ -43,27 +43,42 @@ def transform_medicaid_data(state_admin_df, cd_survey_df, year): reporting_period = year * 100 + 12 print(f"Reporting period is {reporting_period}") state_df = state_admin_df.loc[ - (state_admin_df["Reporting Period"] == reporting_period) & - (state_admin_df["Final Report"] == "Y"), - ["State Abbreviation", "Reporting Period", "Total Medicaid Enrollment"] + (state_admin_df["Reporting Period"] == reporting_period) + & (state_admin_df["Final Report"] == "Y"), + [ + "State Abbreviation", + "Reporting Period", + "Total Medicaid Enrollment", + ], ] state_df["FIPS"] = state_df["State Abbreviation"].map(STATE_ABBREV_TO_FIPS) - cd_df = cd_survey_df[["GEO_ID", "state", "congressional district", "S2704_C02_006E"]] + cd_df = cd_survey_df[ + ["GEO_ID", "state", "congressional district", "S2704_C02_006E"] + ] nc_cd_sum = cd_df.loc[cd_df.state == "37"].S2704_C02_006E.astype(int).sum() - nc_state_sum = state_df.loc[state_df.FIPS == '37']['Total Medicaid Enrollment'].values[0] - assert nc_cd_sum > .5 * nc_state_sum + nc_state_sum = state_df.loc[state_df.FIPS == "37"][ + "Total Medicaid Enrollment" + ].values[0] + assert nc_cd_sum > 0.5 * nc_state_sum assert nc_cd_sum <= nc_state_sum - state_df = state_df.rename(columns={'Total Medicaid Enrollment': 'medicaid_enrollment'}) - state_df['ucgid_str'] = '0400000US' + state_df['FIPS'].astype(str) + state_df = state_df.rename( + columns={"Total Medicaid Enrollment": "medicaid_enrollment"} + ) + state_df["ucgid_str"] = "0400000US" + state_df["FIPS"].astype(str) - cd_df = cd_df.rename(columns={'S2704_C02_006E': 'medicaid_enrollment', 'GEO_ID': 'ucgid_str'}) - cd_df = cd_df.loc[cd_df.state != '72'] + cd_df = cd_df.rename( + columns={ + "S2704_C02_006E": "medicaid_enrollment", + "GEO_ID": "ucgid_str", + } + ) + cd_df = cd_df.loc[cd_df.state != "72"] - out_cols = ['ucgid_str', 'medicaid_enrollment'] + out_cols = ["ucgid_str", "medicaid_enrollment"] return state_df[out_cols], cd_df[out_cols] @@ -80,7 +95,9 @@ def load_medicaid_data(long_state, long_cd, year): # National ---------------- nat_stratum = Stratum( - parent_stratum_id=None, stratum_group_id=0, notes="Geo: 0100000US Medicaid Enrolled" + parent_stratum_id=None, + stratum_group_id=0, + notes="Geo: 0100000US Medicaid Enrolled", ) nat_stratum.constraints_rel = [ StratumConstraint( @@ -101,7 +118,7 @@ def load_medicaid_data(long_state, long_cd, year): stratum_lookup["National"] = nat_stratum.stratum_id # State ------------------- - stratum_lookup["State"] = {} + stratum_lookup["State"] = {} for _, row in long_state.iterrows(): note = f"Geo: {row['ucgid_str']} Medicaid Enrolled" @@ -133,13 +150,15 @@ def load_medicaid_data(long_state, long_cd, year): ) 
session.add(new_stratum) session.flush() - stratum_lookup["State"][row['ucgid_str']] = new_stratum.stratum_id + stratum_lookup["State"][row["ucgid_str"]] = new_stratum.stratum_id # District ------------------- for _, row in long_cd.iterrows(): note = f"Geo: {row['ucgid_str']} Medicaid Enrolled" - parent_stratum_id = stratum_lookup["State"][f'0400000US{row["ucgid_str"][-4:-2]}'] + parent_stratum_id = stratum_lookup["State"][ + f'0400000US{row["ucgid_str"][-4:-2]}' + ] new_stratum = Stratum( parent_stratum_id=parent_stratum_id, stratum_group_id=0, notes=note @@ -179,7 +198,9 @@ def load_medicaid_data(long_state, long_cd, year): cd_survey_df, state_admin_df = extract_medicaid_data(year) # Transform ------------------- - long_state, long_cd = transform_medicaid_data(state_admin_df, cd_survey_df, year) + long_state, long_cd = transform_medicaid_data( + state_admin_df, cd_survey_df, year + ) # Load ----------------------- load_medicaid_data(long_state, long_cd, year) diff --git a/policyengine_us_data/db/etl_snap.py b/policyengine_us_data/db/etl_snap.py index f9a172a9..fb110025 100644 --- a/policyengine_us_data/db/etl_snap.py +++ b/policyengine_us_data/db/etl_snap.py @@ -134,10 +134,8 @@ def transform_administrative_snap_data(zip_file, year): def transform_survey_snap_data(raw_df): df = raw_df.copy() - return df[["GEO_ID", "S2201_C03_001E"]].rename({ - "GEO_ID": "ucgid_str", - "S2201_C03_001E": "snap_household_ct" - }, axis=1 + return df[["GEO_ID", "S2201_C03_001E"]].rename( + {"GEO_ID": "ucgid_str", "S2201_C03_001E": "snap_household_ct"}, axis=1 )[ ~df["GEO_ID"].isin( [ # Puerto Rico's state and district @@ -160,7 +158,9 @@ def load_administrative_snap_data(df_states, year): # National ---------------- nat_stratum = Stratum( - parent_stratum_id=None, stratum_group_id=0, notes="Geo: 0100000US Received SNAP Benefits" + parent_stratum_id=None, + stratum_group_id=0, + notes="Geo: 0100000US Received SNAP Benefits", ) nat_stratum.constraints_rel = [ StratumConstraint( @@ -182,7 +182,7 @@ def load_administrative_snap_data(df_states, year): stratum_lookup["National"] = nat_stratum.stratum_id # State ------------------- - stratum_lookup["State"] = {} + stratum_lookup["State"] = {} for _, row in df_states.iterrows(): note = f"Geo: {row['ucgid_str']} Received SNAP Benefits" @@ -224,13 +224,13 @@ def load_administrative_snap_data(df_states, year): ) session.add(new_stratum) session.flush() - stratum_lookup["State"][row['ucgid_str']] = new_stratum.stratum_id + stratum_lookup["State"][row["ucgid_str"]] = new_stratum.stratum_id session.commit() return stratum_lookup -def load_survey_snap_data(survey_df, year, stratum_lookup ={}): +def load_survey_snap_data(survey_df, year, stratum_lookup={}): """Use an already defined stratum_lookup to load the survey SNAP data""" DATABASE_URL = "sqlite:///policy_data.db" @@ -243,8 +243,8 @@ def load_survey_snap_data(survey_df, year, stratum_lookup ={}): district_df = survey_df.copy() for _, row in district_df.iterrows(): note = f"Geo: {row['ucgid_str']} Received SNAP Benefits" - state_ucgid_str = '0400000US' + row['ucgid_str'][9:11] - state_stratum_id = stratum_lookup['State'][state_ucgid_str] + state_ucgid_str = "0400000US" + row["ucgid_str"][9:11] + state_stratum_id = stratum_lookup["State"][state_ucgid_str] new_stratum = Stratum( parent_stratum_id=state_stratum_id, stratum_group_id=0, notes=note ) @@ -258,7 +258,7 @@ def load_survey_snap_data(survey_df, year, stratum_lookup ={}): StratumConstraint( constraint_variable="snap", operation="greater_than", - value='0', + 
value="0", ), ] new_stratum.targets_rel.append( diff --git a/policyengine_us_data/db/temp.py b/policyengine_us_data/db/temp.py index 6026ace0..0c687ff3 100644 --- a/policyengine_us_data/db/temp.py +++ b/policyengine_us_data/db/temp.py @@ -1,6 +1,8 @@ # ucgid_str converts the household’s ucgid enumeration into a comma‑separated string of all hierarchical UCGID codes. from policyengine_us import Simulation -from policyengine_us.variables.household.demographic.geographic.ucgid.ucgid_enum import UCGID +from policyengine_us.variables.household.demographic.geographic.ucgid.ucgid_enum import ( + UCGID, +) # Minimal one-household simulation sim = Simulation( @@ -22,36 +24,40 @@ # First, let's explore UCGID, the enum, and how it can create the hierarchy import pandas as pd -from policyengine_us.variables.household.demographic.geographic.ucgid.ucgid_enum import UCGID +from policyengine_us.variables.household.demographic.geographic.ucgid.ucgid_enum import ( + UCGID, +) rows = [] for node in UCGID: codes = node.get_hierarchical_codes() - rows.append({ - "name": node.name, - "code": codes[0], - "parent": codes[1] if len(codes) > 1 else None - }) + rows.append( + { + "name": node.name, + "code": codes[0], + "parent": codes[1] if len(codes) > 1 else None, + } + ) hierarchy_df = ( pd.DataFrame(rows) - .sort_values(["parent", "code"], na_position="first") - .reset_index(drop=True) + .sort_values(["parent", "code"], na_position="first") + .reset_index(drop=True) ) print(hierarchy_df) -#Out[262]: +# Out[262]: # name code parent -#0 US 0100000US None -#1 AL 0400000US01 0100000US -#2 AK 0400000US02 0100000US -#3 AZ 0400000US04 0100000US -#4 AR 0400000US05 0100000US -#.. ... ... ... -#483 WI_05 5001800US5505 0400000US55 -#484 WI_06 5001800US5506 0400000US55 -#485 WI_07 5001800US5507 0400000US55 -#486 WI_08 5001800US5508 0400000US55 -#487 WY_01 5001800US5600 0400000US56 +# 0 US 0100000US None +# 1 AL 0400000US01 0100000US +# 2 AK 0400000US02 0100000US +# 3 AZ 0400000US04 0100000US +# 4 AR 0400000US05 0100000US +# .. ... ... ... 
+# 483 WI_05 5001800US5505 0400000US55 +# 484 WI_06 5001800US5506 0400000US55 +# 485 WI_07 5001800US5507 0400000US55 +# 486 WI_08 5001800US5508 0400000US55 +# 487 WY_01 5001800US5600 0400000US56 # -#[488 rows x 3 columns] +# [488 rows x 3 columns] diff --git a/policyengine_us_data/storage/calibration_targets/make_district_mapping.py b/policyengine_us_data/storage/calibration_targets/make_district_mapping.py index cc3f50fb..72ff8d88 100644 --- a/policyengine_us_data/storage/calibration_targets/make_district_mapping.py +++ b/policyengine_us_data/storage/calibration_targets/make_district_mapping.py @@ -246,7 +246,11 @@ def get_district_mapping(): mapping_matrix[i, j] = row.proportion assert np.allclose(mapping_matrix.sum(axis=1), 1.0) - return {'mapping_matrix': mapping_matrix, 'old_codes': old_codes, 'new_codes': new_codes} + return { + "mapping_matrix": mapping_matrix, + "old_codes": old_codes, + "new_codes": new_codes, + } if __name__ == "__main__": diff --git a/policyengine_us_data/utils/census.py b/policyengine_us_data/utils/census.py index 018cb6a7..fb577e60 100644 --- a/policyengine_us_data/utils/census.py +++ b/policyengine_us_data/utils/census.py @@ -60,17 +60,57 @@ } STATE_ABBREV_TO_FIPS = { - 'AL': '01', 'AK': '02', 'AZ': '04', 'AR': '05', 'CA': '06', - 'CO': '08', 'CT': '09', 'DE': '10', 'FL': '12', 'GA': '13', - 'HI': '15', 'ID': '16', 'IL': '17', 'IN': '18', 'IA': '19', - 'KS': '20', 'KY': '21', 'LA': '22', 'ME': '23', 'MD': '24', - 'MA': '25', 'MI': '26', 'MN': '27', 'MS': '28', 'MO': '29', - 'MT': '30', 'NE': '31', 'NV': '32', 'NH': '33', 'NJ': '34', - 'NM': '35', 'NY': '36', 'NC': '37', 'ND': '38', 'OH': '39', - 'OK': '40', 'OR': '41', 'PA': '42', 'RI': '44', 'SC': '45', - 'SD': '46', 'TN': '47', 'TX': '48', 'UT': '49', 'VT': '50', - 'VA': '51', 'WA': '53', 'WV': '54', 'WI': '55', 'WY': '56', - 'DC': '11' + "AL": "01", + "AK": "02", + "AZ": "04", + "AR": "05", + "CA": "06", + "CO": "08", + "CT": "09", + "DE": "10", + "FL": "12", + "GA": "13", + "HI": "15", + "ID": "16", + "IL": "17", + "IN": "18", + "IA": "19", + "KS": "20", + "KY": "21", + "LA": "22", + "ME": "23", + "MD": "24", + "MA": "25", + "MI": "26", + "MN": "27", + "MS": "28", + "MO": "29", + "MT": "30", + "NE": "31", + "NV": "32", + "NH": "33", + "NJ": "34", + "NM": "35", + "NY": "36", + "NC": "37", + "ND": "38", + "OH": "39", + "OK": "40", + "OR": "41", + "PA": "42", + "RI": "44", + "SC": "45", + "SD": "46", + "TN": "47", + "TX": "48", + "UT": "49", + "VT": "50", + "VA": "51", + "WA": "53", + "WV": "54", + "WI": "55", + "WY": "56", + "DC": "11", } TERRITORY_UCGIDS = { @@ -103,9 +143,9 @@ def pull_acs_table(group: str, geo: str, year: int) -> pd.DataFrame: "year": e.g., 2023 """ base = f"https://api.census.gov/data/{year}/acs/acs1" - - if group[0] == 'S': - base = base + "/subject" + + if group[0] == "S": + base = base + "/subject" geo_q = { "National": "us:*", "State": "state:*", diff --git a/policyengine_us_data/utils/db.py b/policyengine_us_data/utils/db.py index bb484fab..a8081db4 100644 --- a/policyengine_us_data/utils/db.py +++ b/policyengine_us_data/utils/db.py @@ -3,7 +3,10 @@ from sqlmodel import Session, select import sqlalchemy as sa -from policyengine_us_data.db.create_database_tables import Stratum, StratumConstraint +from policyengine_us_data.db.create_database_tables import ( + Stratum, + StratumConstraint, +) def get_stratum_by_id(session: Session, stratum_id: int) -> Optional[Stratum]: @@ -11,14 +14,18 @@ def get_stratum_by_id(session: Session, stratum_id: int) -> Optional[Stratum]: return 
session.get(Stratum, stratum_id) -def get_simple_stratum_by_ucgid(session: Session, ucgid: str) -> Optional[Stratum]: +def get_simple_stratum_by_ucgid( + session: Session, ucgid: str +) -> Optional[Stratum]: """ Finds a stratum defined *only* by a single ucgid_str constraint. """ constraint_count_subquery = ( select( StratumConstraint.stratum_id, - sa.func.count(StratumConstraint.stratum_id).label("constraint_count") + sa.func.count(StratumConstraint.stratum_id).label( + "constraint_count" + ), ) .group_by(StratumConstraint.stratum_id) .subquery() @@ -29,7 +36,7 @@ def get_simple_stratum_by_ucgid(session: Session, ucgid: str) -> Optional[Stratu .join(StratumConstraint) .join( constraint_count_subquery, - Stratum.stratum_id == constraint_count_subquery.c.stratum_id + Stratum.stratum_id == constraint_count_subquery.c.stratum_id, ) .where(StratumConstraint.constraint_variable == "ucgid_str") .where(StratumConstraint.value == ucgid) From 9c3a460246d697f2c10b7f7bffc8ca5fde579ed7 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Thu, 14 Aug 2025 22:23:04 -0400 Subject: [PATCH 14/27] fixed national stratum in agi script --- policyengine_us_data/db/etl_irs_soi.py | 7 +-- policyengine_us_data/db/temp.py | 63 -------------------------- 2 files changed, 4 insertions(+), 66 deletions(-) delete mode 100644 policyengine_us_data/db/temp.py diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py index a4a07cfe..5e28e464 100644 --- a/policyengine_us_data/db/etl_irs_soi.py +++ b/policyengine_us_data/db/etl_irs_soi.py @@ -446,6 +446,7 @@ def load_soi_data(long_dfs, year): # Make a National Stratum for each AGI Stub, even though there's no national target # There no national target because the data set only has agi_stub = 0 for national + note = f"Geo: 0100000US, AGI > {agi_income_lower}, AGI < {agi_income_upper}" nat_stratum = Stratum( parent_stratum_id=None, stratum_group_id=0, notes=note ) @@ -454,7 +455,7 @@ def load_soi_data(long_dfs, year): StratumConstraint( constraint_variable="ucgid_str", operation="in", - value=ucgid_i, + value="0100000US", ), StratumConstraint( constraint_variable="agi", @@ -515,7 +516,7 @@ def load_soi_data(long_dfs, year): ), ] ) - new_stratum.targets_rel = [ + new_stratum.targets_rel.append( Target( variable="person_count", period=year, @@ -523,7 +524,7 @@ def load_soi_data(long_dfs, year): source_id=5, active=True, ) - ] + ) session.add(new_stratum) session.flush() diff --git a/policyengine_us_data/db/temp.py b/policyengine_us_data/db/temp.py deleted file mode 100644 index 0c687ff3..00000000 --- a/policyengine_us_data/db/temp.py +++ /dev/null @@ -1,63 +0,0 @@ -# ucgid_str converts the household’s ucgid enumeration into a comma‑separated string of all hierarchical UCGID codes. 
-from policyengine_us import Simulation -from policyengine_us.variables.household.demographic.geographic.ucgid.ucgid_enum import ( - UCGID, -) - -# Minimal one-household simulation -sim = Simulation( - situation={ - "people": {"p1": {}}, - "households": {"h1": {"members": ["p1"]}}, - } -) - -# Assign a specific UCGID (California district 23) -sim.set_input("ucgid", 2024, UCGID.CA_23) - -# Use the ucgid_str Variable's formula -ucgid_str_val = sim.calculate("ucgid_str", 2024) -print(ucgid_str_val) -# ['5001800US0623,0400000US06,0100000US'] - - -# First, let's explore UCGID, the enum, and how it can create the hierarchy - -import pandas as pd -from policyengine_us.variables.household.demographic.geographic.ucgid.ucgid_enum import ( - UCGID, -) - -rows = [] -for node in UCGID: - codes = node.get_hierarchical_codes() - rows.append( - { - "name": node.name, - "code": codes[0], - "parent": codes[1] if len(codes) > 1 else None, - } - ) - -hierarchy_df = ( - pd.DataFrame(rows) - .sort_values(["parent", "code"], na_position="first") - .reset_index(drop=True) -) - -print(hierarchy_df) -# Out[262]: -# name code parent -# 0 US 0100000US None -# 1 AL 0400000US01 0100000US -# 2 AK 0400000US02 0100000US -# 3 AZ 0400000US04 0100000US -# 4 AR 0400000US05 0100000US -# .. ... ... ... -# 483 WI_05 5001800US5505 0400000US55 -# 484 WI_06 5001800US5506 0400000US55 -# 485 WI_07 5001800US5507 0400000US55 -# 486 WI_08 5001800US5508 0400000US55 -# 487 WY_01 5001800US5600 0400000US56 -# -# [488 rows x 3 columns] From 81e2011826e3733bafdddfbf79338931836b208e Mon Sep 17 00:00:00 2001 From: Ben Ogorek Date: Fri, 15 Aug 2025 08:43:33 -0400 Subject: [PATCH 15/27] refactor: use sqlmodel session --- .../db/create_initial_strata.py | 50 ++--- policyengine_us_data/db/etl_age.py | 122 +++++------ policyengine_us_data/db/etl_medicaid.py | 168 +++++++-------- policyengine_us_data/db/etl_snap.py | 203 +++++++++--------- 4 files changed, 266 insertions(+), 277 deletions(-) diff --git a/policyengine_us_data/db/create_initial_strata.py b/policyengine_us_data/db/create_initial_strata.py index a2a333df..068bca30 100644 --- a/policyengine_us_data/db/create_initial_strata.py +++ b/policyengine_us_data/db/create_initial_strata.py @@ -1,9 +1,7 @@ from typing import Dict import pandas as pd -from sqlalchemy import create_engine -from sqlalchemy.orm import sessionmaker -from sqlmodel import SQLModel, Session, select +from sqlmodel import Session, create_engine from policyengine_us.variables.household.demographic.geographic.ucgid.ucgid_enum import ( @@ -37,40 +35,38 @@ def main(): DATABASE_URL = "sqlite:///policy_data.db" engine = create_engine(DATABASE_URL) - Session = sessionmaker(bind=engine) - session = Session() - # map the ucgid_str 'code' to auto-generated 'stratum_id' code_to_stratum_id: Dict[str, int] = {} - for _, row in hierarchy_df.iterrows(): - parent_code = row["parent"] - - parent_id = ( - code_to_stratum_id.get(parent_code) if parent_code else None - ) + with Session(engine) as session: + for _, row in hierarchy_df.iterrows(): + parent_code = row["parent"] - new_stratum = Stratum( - parent_stratum_id=parent_id, - notes=f'{row["name"]} (ucgid {row["code"]})', - stratum_group_id=1, - ) + parent_id = ( + code_to_stratum_id.get(parent_code) if parent_code else None + ) - new_stratum.constraints_rel = [ - StratumConstraint( - constraint_variable="ucgid_str", - operation="in", - value=row["code"], + new_stratum = Stratum( + parent_stratum_id=parent_id, + notes=f'{row["name"]} (ucgid {row["code"]})', + stratum_group_id=1, ) - ] - 
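# A minimal, self-contained sketch of the session pattern this refactor
# adopts; the in-memory engine here is an assumption for illustration only:
from sqlmodel import Session, create_engine

demo_engine = create_engine("sqlite://")  # throwaway in-memory database
with Session(demo_engine) as demo_session:
    demo_session.flush()  # session is closed automatically on exit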
session.add(new_stratum) + new_stratum.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="in", + value=row["code"], + ) + ] + + session.add(new_stratum) - session.flush() + session.flush() - code_to_stratum_id[row["code"]] = new_stratum.stratum_id + code_to_stratum_id[row["code"]] = new_stratum.stratum_id - session.commit() + session.commit() if __name__ == "__main__": diff --git a/policyengine_us_data/db/etl_age.py b/policyengine_us_data/db/etl_age.py index 7bb36ed4..bc540373 100644 --- a/policyengine_us_data/db/etl_age.py +++ b/policyengine_us_data/db/etl_age.py @@ -1,11 +1,6 @@ -import requests -from pathlib import Path -import io - import pandas as pd import numpy as np -from sqlalchemy import create_engine -from sqlalchemy.orm import sessionmaker +from sqlmodel import Session, create_engine from policyengine_us_data.db.create_database_tables import ( Stratum, @@ -95,7 +90,7 @@ def get_parent_geo(geo): return {"National": None, "State": "National", "District": "State"}[geo] -def load_age_data(df_long, geo, year, stratum_lookup={}): +def load_age_data(df_long, geo, year, stratum_lookup=None): # Quick data quality check before loading ---- if geo == "National": @@ -111,78 +106,77 @@ def load_age_data(df_long, geo, year, stratum_lookup={}): DATABASE_URL = "sqlite:///policy_data.db" engine = create_engine(DATABASE_URL) - Session = sessionmaker(bind=engine) - session = Session() - - if not stratum_lookup: + if stratum_lookup is None: if geo != "National": raise ValueError("Include stratum_lookup unless National geo") stratum_lookup = {"National": {}} else: stratum_lookup[geo] = {} - for _, row in df_long.iterrows(): - - # Create the parent Stratum object. - # We will attach children to it before adding it to the session. - note = f"Age: {row['age_range']}, Geo: {row['ucgid_str']}" - parent_geo = get_parent_geo(geo) - parent_stratum_id = ( - stratum_lookup[parent_geo][row["age_range"]] - if parent_geo - else None - ) - - new_stratum = Stratum( - parent_stratum_id=parent_stratum_id, stratum_group_id=0, notes=note - ) - - # Create constraints and link them to the parent's relationship attribute. - # TODO: greater_than_or_equal_to to just greater than! - new_stratum.constraints_rel = [ - StratumConstraint( - constraint_variable="ucgid_str", - operation="in", - value=row["ucgid_str"], - ), - StratumConstraint( - constraint_variable="age", - operation="greater_than", - value=str(row["age_greater_than"]), - ), - ] + with Session(engine) as session: + for _, row in df_long.iterrows(): + # Create the parent Stratum object. + # We will attach children to it before adding it to the session. + note = f"Age: {row['age_range']}, Geo: {row['ucgid_str']}" + parent_geo = get_parent_geo(geo) + parent_stratum_id = ( + stratum_lookup[parent_geo][row["age_range"]] + if parent_geo + else None + ) - age_lt_value = row["age_less_than"] - if not np.isinf(age_lt_value): - new_stratum.constraints_rel.append( + new_stratum = Stratum( + parent_stratum_id=parent_stratum_id, + stratum_group_id=0, + notes=note, + ) + + # Create constraints and link them to the parent's relationship attribute. + # TODO: greater_than_or_equal_to to just greater than! 
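# A minimal sketch of the interval encoding behind the TODO above: a closed
# ACS band [lo, hi] is stored as the strict pair age > lo - 1 and
# age < hi + 1, matching the gt/lt columns built in transform_age_data.
lo, hi = 25, 34  # assumed example band
gt, lt = lo - 1, hi + 1
assert all(gt < age < lt for age in range(lo, hi + 1))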
+ new_stratum.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="in", + value=row["ucgid_str"], + ), StratumConstraint( constraint_variable="age", - operation="less_than", - value=str(row["age_less_than"]), + operation="greater_than", + value=str(row["age_greater_than"]), + ), + ] + + age_lt_value = row["age_less_than"] + if not np.isinf(age_lt_value): + new_stratum.constraints_rel.append( + StratumConstraint( + constraint_variable="age", + operation="less_than", + value=str(row["age_less_than"]), + ) ) - ) - # Create the Target and link it to the parent. - new_stratum.targets_rel.append( - Target( - variable=row["variable"], - period=year, - value=row["value"], - source_id=row["source_id"], - active=row["active"], + # Create the Target and link it to the parent. + new_stratum.targets_rel.append( + Target( + variable=row["variable"], + period=year, + value=row["value"], + source_id=row["source_id"], + active=row["active"], + ) ) - ) - # Add ONLY the parent object to the session. - # The 'cascade' setting will handle the children automatically. - session.add(new_stratum) + # Add ONLY the parent object to the session. + # The 'cascade' setting will handle the children automatically. + session.add(new_stratum) - # Flush to get the id - session.flush() - stratum_lookup[geo][row["age_range"]] = new_stratum.stratum_id + # Flush to get the id + session.flush() + stratum_lookup[geo][row["age_range"]] = new_stratum.stratum_id - # Commit all the new objects at once. - session.commit() + # Commit all the new objects at once. + session.commit() return stratum_lookup diff --git a/policyengine_us_data/db/etl_medicaid.py b/policyengine_us_data/db/etl_medicaid.py index 3a5ab7d7..4ff96278 100644 --- a/policyengine_us_data/db/etl_medicaid.py +++ b/policyengine_us_data/db/etl_medicaid.py @@ -1,8 +1,7 @@ import requests import pandas as pd -from sqlalchemy import create_engine -from sqlalchemy.orm import sessionmaker +from sqlmodel import Session, create_engine from policyengine_us_data.db.create_database_tables import ( Stratum, @@ -86,52 +85,21 @@ def load_medicaid_data(long_state, long_cd, year): DATABASE_URL = "sqlite:///policy_data.db" engine = create_engine(DATABASE_URL) - year = 2023 - - Session = sessionmaker(bind=engine) - session = Session() stratum_lookup = {} - # National ---------------- - nat_stratum = Stratum( - parent_stratum_id=None, - stratum_group_id=0, - notes="Geo: 0100000US Medicaid Enrolled", - ) - nat_stratum.constraints_rel = [ - StratumConstraint( - constraint_variable="ucgid_str", - operation="in", - value="0100000US", - ), - StratumConstraint( - constraint_variable="medicaid_enrolled", - operation="equals", - value="True", - ), - ] - # No target at the national level is provided at this time. 
- - session.add(nat_stratum) - session.flush() - stratum_lookup["National"] = nat_stratum.stratum_id - - # State ------------------- - stratum_lookup["State"] = {} - for _, row in long_state.iterrows(): - - note = f"Geo: {row['ucgid_str']} Medicaid Enrolled" - parent_stratum_id = nat_stratum.stratum_id - - new_stratum = Stratum( - parent_stratum_id=parent_stratum_id, stratum_group_id=0, notes=note + with Session(engine) as session: + # National ---------------- + nat_stratum = Stratum( + parent_stratum_id=None, + stratum_group_id=0, + notes="Geo: 0100000US Medicaid Enrolled", ) - new_stratum.constraints_rel = [ + nat_stratum.constraints_rel = [ StratumConstraint( constraint_variable="ucgid_str", operation="in", - value=row["ucgid_str"], + value="0100000US", ), StratumConstraint( constraint_variable="medicaid_enrolled", @@ -139,55 +107,87 @@ def load_medicaid_data(long_state, long_cd, year): value="True", ), ] - new_stratum.targets_rel.append( - Target( - variable="person_count", - period=year, - value=row["medicaid_enrollment"], - source_id=2, - active=True, - ) - ) - session.add(new_stratum) + # No target at the national level is provided at this time. + + session.add(nat_stratum) session.flush() - stratum_lookup["State"][row["ucgid_str"]] = new_stratum.stratum_id + stratum_lookup["National"] = nat_stratum.stratum_id - # District ------------------- - for _, row in long_cd.iterrows(): + # State ------------------- + stratum_lookup["State"] = {} + for _, row in long_state.iterrows(): - note = f"Geo: {row['ucgid_str']} Medicaid Enrolled" - parent_stratum_id = stratum_lookup["State"][ - f'0400000US{row["ucgid_str"][-4:-2]}' - ] + note = f"Geo: {row['ucgid_str']} Medicaid Enrolled" + parent_stratum_id = nat_stratum.stratum_id - new_stratum = Stratum( - parent_stratum_id=parent_stratum_id, stratum_group_id=0, notes=note - ) - new_stratum.constraints_rel = [ - StratumConstraint( - constraint_variable="ucgid_str", - operation="in", - value=row["ucgid_str"], - ), - StratumConstraint( - constraint_variable="medicaid_enrolled", - operation="equals", - value="True", - ), - ] - new_stratum.targets_rel.append( - Target( - variable="person_count", - period=year, - value=row["medicaid_enrollment"], - source_id=2, - active=True, + new_stratum = Stratum( + parent_stratum_id=parent_stratum_id, + stratum_group_id=0, + notes=note, ) - ) - session.add(new_stratum) - session.flush() + new_stratum.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="in", + value=row["ucgid_str"], + ), + StratumConstraint( + constraint_variable="medicaid_enrolled", + operation="equals", + value="True", + ), + ] + new_stratum.targets_rel.append( + Target( + variable="person_count", + period=year, + value=row["medicaid_enrollment"], + source_id=2, + active=True, + ) + ) + session.add(new_stratum) + session.flush() + stratum_lookup["State"][row["ucgid_str"]] = new_stratum.stratum_id + + # District ------------------- + for _, row in long_cd.iterrows(): + + note = f"Geo: {row['ucgid_str']} Medicaid Enrolled" + parent_stratum_id = stratum_lookup["State"][ + f'0400000US{row["ucgid_str"][-4:-2]}' + ] + + new_stratum = Stratum( + parent_stratum_id=parent_stratum_id, + stratum_group_id=0, + notes=note, + ) + new_stratum.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="in", + value=row["ucgid_str"], + ), + StratumConstraint( + constraint_variable="medicaid_enrolled", + operation="equals", + value="True", + ), + ] + new_stratum.targets_rel.append( + Target( + 
variable="person_count", + period=year, + value=row["medicaid_enrollment"], + source_id=2, + active=True, + ) + ) + session.add(new_stratum) + session.flush() - session.commit() + session.commit() if __name__ == "__main__": diff --git a/policyengine_us_data/db/etl_snap.py b/policyengine_us_data/db/etl_snap.py index fb110025..a60c0074 100644 --- a/policyengine_us_data/db/etl_snap.py +++ b/policyengine_us_data/db/etl_snap.py @@ -1,15 +1,11 @@ import requests import zipfile import io -import os -import re -from pathlib import Path import pandas as pd import numpy as np import us -from sqlalchemy import create_engine -from sqlalchemy.orm import sessionmaker +from sqlmodel import Session, create_engine from policyengine_us_data.db.create_database_tables import ( Stratum, @@ -151,129 +147,132 @@ def load_administrative_snap_data(df_states, year): DATABASE_URL = "sqlite:///policy_data.db" engine = create_engine(DATABASE_URL) - Session = sessionmaker(bind=engine) - session = Session() - stratum_lookup = {} - # National ---------------- - nat_stratum = Stratum( - parent_stratum_id=None, - stratum_group_id=0, - notes="Geo: 0100000US Received SNAP Benefits", - ) - nat_stratum.constraints_rel = [ - StratumConstraint( - constraint_variable="ucgid_str", - operation="in", - value="0100000US", - ), - StratumConstraint( - constraint_variable="snap", - operation="is_greater_than", - value="0", - ), - ] - # No target at the national level is provided at this time. Keeping it - # so that the state strata can have a parent stratum - - session.add(nat_stratum) - session.flush() - stratum_lookup["National"] = nat_stratum.stratum_id - - # State ------------------- - stratum_lookup["State"] = {} - for _, row in df_states.iterrows(): - - note = f"Geo: {row['ucgid_str']} Received SNAP Benefits" - parent_stratum_id = nat_stratum.stratum_id - - new_stratum = Stratum( - parent_stratum_id=parent_stratum_id, stratum_group_id=0, notes=note + with Session(engine) as session: + # National ---------------- + nat_stratum = Stratum( + parent_stratum_id=None, + stratum_group_id=0, + notes="Geo: 0100000US Received SNAP Benefits", ) - new_stratum.constraints_rel = [ + nat_stratum.constraints_rel = [ StratumConstraint( constraint_variable="ucgid_str", operation="in", - value=row["ucgid_str"], + value="0100000US", ), StratumConstraint( constraint_variable="snap", - operation="is_greater_than", + operation="greater_than", value="0", ), ] - # Two targets now. Same data source. Same stratum - new_stratum.targets_rel.append( - Target( - variable="household_count", - period=year, - value=row["Households"], - source_id=3, - active=True, + # No target at the national level is provided at this time. 
Keeping it
+        # so that the state strata can have a parent stratum
+
+        session.add(nat_stratum)
+        session.flush()
+        stratum_lookup["National"] = nat_stratum.stratum_id
+
+        # State -------------------
+        stratum_lookup["State"] = {}
+        for _, row in df_states.iterrows():
+
+            note = f"Geo: {row['ucgid_str']} Received SNAP Benefits"
+            parent_stratum_id = nat_stratum.stratum_id
+
+            new_stratum = Stratum(
+                parent_stratum_id=parent_stratum_id,
+                stratum_group_id=0,
+                notes=note,
             )
-        new_stratum.constraints_rel = [
+            new_stratum.constraints_rel = [
                 StratumConstraint(
                     constraint_variable="ucgid_str",
                     operation="in",
                     value=row["ucgid_str"],
                 ),
                 StratumConstraint(
                     constraint_variable="snap",
-                    operation="is_greater_than",
+                    operation="greater_than",
                     value="0",
                 ),
             ]
-        # Two targets now. Same data source. Same stratum
-        new_stratum.targets_rel.append(
-            Target(
-                variable="household_count",
-                period=year,
-                value=row["Households"],
-                source_id=3,
-                active=True,
+            # Two targets now. Same data source. Same stratum
+            new_stratum.targets_rel.append(
+                Target(
+                    variable="household_count",
+                    period=year,
+                    value=row["Households"],
+                    source_id=3,
+                    active=True,
+                )
             )
-        new_stratum.targets_rel.append(
-            Target(
-                variable="snap",
-                period=year,
-                value=row["Cost"],
-                source_id=3,
-                active=True,
+            new_stratum.targets_rel.append(
+                Target(
+                    variable="snap",
+                    period=year,
+                    value=row["Cost"],
+                    source_id=3,
+                    active=True,
+                )
+            )
+            session.add(new_stratum)
+            session.flush()
+            stratum_lookup["State"][row["ucgid_str"]] = new_stratum.stratum_id
 
-    session.commit()
+        session.commit()
 
     return stratum_lookup
 
 
-def load_survey_snap_data(survey_df, year, stratum_lookup={}):
+def load_survey_snap_data(survey_df, year, stratum_lookup=None):
     """Use an already defined stratum_lookup to load the survey SNAP data"""
 
+    if stratum_lookup is None:
+        raise ValueError("stratum_lookup must be provided")
+
     DATABASE_URL = "sqlite:///policy_data.db"
     engine = create_engine(DATABASE_URL)
 
-    Session = sessionmaker(bind=engine)
-    session = Session()
-
-    # Create new strata for districts whose households recieve SNAP benefits
-    district_df = survey_df.copy()
-    for _, row in district_df.iterrows():
-        note = f"Geo: {row['ucgid_str']} Received SNAP Benefits"
-        state_ucgid_str = "0400000US" + row["ucgid_str"][9:11]
-        state_stratum_id = stratum_lookup["State"][state_ucgid_str]
-        new_stratum = Stratum(
-            parent_stratum_id=state_stratum_id, stratum_group_id=0, notes=note
-        )
+    with Session(engine) as session:
+        # Create new strata for districts whose households receive SNAP benefits
+        district_df = survey_df.copy()
+        for _, row in district_df.iterrows():
+            note = f"Geo: {row['ucgid_str']} Received SNAP Benefits"
+            state_ucgid_str = "0400000US" + row["ucgid_str"][9:11]
+            state_stratum_id = stratum_lookup["State"][state_ucgid_str]
+            new_stratum = Stratum(
+                parent_stratum_id=state_stratum_id,
+                stratum_group_id=0,
+                notes=note,
+            )
 
-        new_stratum.constraints_rel = [
-            StratumConstraint(
-                constraint_variable="ucgid_str",
-                operation="in",
-                value=row["ucgid_str"],
-            ),
-            StratumConstraint(
-                constraint_variable="snap",
-                operation="greater_than",
-                value="0",
-            ),
-        ]
-        new_stratum.targets_rel.append(
-            Target(
-                variable="household_count",
-                period=year,
-                value=row["snap_household_ct"],
-                source_id=4,
-                active=True,
+            new_stratum.constraints_rel = [
+                StratumConstraint(
+                    constraint_variable="ucgid_str",
+                    operation="in",
+                    value=row["ucgid_str"],
+                ),
+                StratumConstraint(
+                    constraint_variable="snap",
+                    operation="greater_than",
+                    value="0",
+                ),
+            ]
+            new_stratum.targets_rel.append(
+                Target(
+                    variable="household_count",
+                    period=year,
value=row["snap_household_ct"],
+                    source_id=4,
+                    active=True,
+                )
             )
-        session.add(new_stratum)
-        session.flush()
+            session.add(new_stratum)
+            session.flush()
 
-    session.commit()
+        session.commit()
 
     return stratum_lookup
 

From d5b3571cf6a592d14efccfe0ab7230bd12bc01c0 Mon Sep 17 00:00:00 2001
From: "baogorek@gmail.com" <baogorek@gmail.com>
Date: Fri, 15 Aug 2025 08:50:06 -0400
Subject: [PATCH 16/27] storage file updates

---
 policyengine_us_data/storage/README.md       |    6 +
 .../storage/district_mapping.csv             | 1502 +++++++++++++++++
 .../storage/upload_completed_datasets.py     |    1 +
 3 files changed, 1509 insertions(+)
 create mode 100644 policyengine_us_data/storage/district_mapping.csv

diff --git a/policyengine_us_data/storage/README.md b/policyengine_us_data/storage/README.md
index 55f98ed9..d2c1f054 100644
--- a/policyengine_us_data/storage/README.md
+++ b/policyengine_us_data/storage/README.md
@@ -9,3 +9,9 @@
   • Source: MACPAC Enrollment Tables, FFY 2024
   • Date: 2024
   • Location: https://www.medicaid.gov/resources-for-states/downloads/eligib-oper-and-enrol-snap-december2024.pdf#page=26
+
+- **district_mapping.csv**
+  • Source: created by the script `policyengine_us/storage/calibration_targets/make_district_mapping.py`
+  • Notes: this script is not part of `make data` because of the length of time it takes to run and the
+    likelihood of timeout errors. See the script for more notes, including an alternative source. Also,
+    once the IRS SOI updates their data in 2026, this mapping will likely be unnecessary.
diff --git a/policyengine_us_data/storage/district_mapping.csv b/policyengine_us_data/storage/district_mapping.csv
new file mode 100644
index 00000000..fb5eef63
--- /dev/null
+++ b/policyengine_us_data/storage/district_mapping.csv
@@ -0,0 +1,1502 @@
+code_old,code_new,proportion
+5001800US0101,5001800US0102,0.5011459283535776
+5001800US0101,5001800US0101,0.4820707064326126
+5001800US0101,5001800US0107,0.016783365213809763
+5001800US0102,5001800US0101,0.4678169491253603
+5001800US0102,5001800US0102,0.39705192508930826
+5001800US0102,5001800US0106,0.1351311257853314
+5001800US0103,5001800US0103,0.8198603796970461
+5001800US0103,5001800US0102,0.180139620302954
+5001800US0104,5001800US0104,0.7692270137962715
+5001800US0104,5001800US0103,0.17447390203471147
+5001800US0104,5001800US0105,0.04907806643602609
+5001800US0104,5001800US0107,0.007221017732990997
+5001800US0105,5001800US0105,0.878817662367874
+5001800US0105,5001800US0104,0.12118233763212609
+5001800US0106,5001800US0106,0.7473312551163003
+5001800US0106,5001800US0107,0.22864246122636855
+5001800US0106,5001800US0104,0.024026283657331227
+5001800US0107,5001800US0107,0.8633782301260244
+5001800US0107,5001800US0102,0.08164855122567322
+5001800US0107,5001800US0104,0.034608328493330016
+5001800US0107,5001800US0106,0.020364890154972295
+5001800US0200,5001800US0200,1.0
+5001800US0401,5001800US0402,0.6948503590446475
+5001800US0401,5001800US0406,0.29812837069923254
+5001800US0401,5001800US0407,0.007021270256119934
+5001800US0402,5001800US0406,0.7492521226985535
+5001800US0402,5001800US0407,0.25074787730144654
+5001800US0403,5001800US0407,0.7754175253607597
+5001800US0403,5001800US0403,0.12104397018340192
+5001800US0403,5001800US0409,0.08970929465623763
+5001800US0403,5001800US0406,0.013829209799600803
+5001800US0403,5001800US0402,0.0
+5001800US0404,5001800US0409,0.46093471548467213
+5001800US0404,5001800US0402,0.32851231142675985
+5001800US0404,5001800US0405,0.19748931684056112
+5001800US0404,5001800US0401,0.007724890354579596
+5001800US0404,5001800US0407,0.005338765893427278
+5001800US0404,5001800US0406,0.0 +5001800US0404,5001800US0408,0.0 +5001800US0405,5001800US0405,0.603375246029751 +5001800US0405,5001800US0404,0.3889039941038361 +5001800US0405,5001800US0401,0.007720759866412899 +5001800US0406,5001800US0401,0.7549732899257154 +5001800US0406,5001800US0408,0.24332248142885407 +5001800US0406,5001800US0404,0.0017042286454305089 +5001800US0407,5001800US0403,0.8274880581174503 +5001800US0407,5001800US0409,0.0731090792934076 +5001800US0407,5001800US0408,0.049613133214102066 +5001800US0407,5001800US0401,0.041687081990827386 +5001800US0407,5001800US0407,0.008038587992499856 +5001800US0407,5001800US0404,6.405939171279232e-05 +5001800US0408,5001800US0408,0.7247876203185409 +5001800US0408,5001800US0409,0.27432545665102437 +5001800US0408,5001800US0403,0.0008836562789413765 +5001800US0408,5001800US0401,3.2667514933137765e-06 +5001800US0408,5001800US0407,0.0 +5001800US0409,5001800US0404,0.6942271093254387 +5001800US0409,5001800US0401,0.23192258506278385 +5001800US0409,5001800US0403,0.05066437655628968 +5001800US0409,5001800US0408,0.023060778551217485 +5001800US0409,5001800US0405,0.0001251505042702215 +5001800US0409,5001800US0402,0.0 +5001800US0501,5001800US0501,0.9636508992922538 +5001800US0501,5001800US0502,0.030829978572819945 +5001800US0501,5001800US0504,0.005519122134926303 +5001800US0502,5001800US0502,0.9185648948063828 +5001800US0502,5001800US0504,0.06575058440898465 +5001800US0502,5001800US0501,0.015684520784632585 +5001800US0503,5001800US0503,0.8207885304659498 +5001800US0503,5001800US0501,0.11114866688730685 +5001800US0503,5001800US0504,0.06806280264674333 +5001800US0504,5001800US0504,0.937181462111058 +5001800US0504,5001800US0503,0.06281853788894193 +5001800US0601,5001800US0601,0.7902007441225233 +5001800US0601,5001800US0603,0.20979925587747675 +5001800US0602,5001800US0602,0.9992455338468796 +5001800US0602,5001800US0604,0.0007544661531203383 +5001800US0603,5001800US0604,0.44723347944752295 +5001800US0603,5001800US0601,0.25560513928793605 +5001800US0603,5001800US0608,0.2287328339799995 +5001800US0603,5001800US0606,0.0501127320716451 +5001800US0603,5001800US0607,0.01653583513428961 +5001800US0603,5001800US0603,0.001779980078606797 +5001800US0604,5001800US0603,0.6014165640096494 +5001800US0604,5001800US0605,0.38940702976059494 +5001800US0604,5001800US0620,0.009176406229755659 +5001800US0604,5001800US0613,0.0 +5001800US0605,5001800US0604,0.5395799676898223 +5001800US0605,5001800US0608,0.3792849775928005 +5001800US0605,5001800US0602,0.06123965207232369 +5001800US0605,5001800US0610,0.01989540264505353 +5001800US0606,5001800US0607,0.5752703267757333 +5001800US0606,5001800US0606,0.4243660728533943 +5001800US0606,5001800US0604,0.0003636003708723783 +5001800US0607,5001800US0606,0.5304593199540961 +5001800US0607,5001800US0607,0.2849996108440379 +5001800US0607,5001800US0603,0.18454106920186597 +5001800US0608,5001800US0623,0.7911347146383424 +5001800US0608,5001800US0633,0.09254544094547086 +5001800US0608,5001800US0603,0.07483581419702297 +5001800US0608,5001800US0628,0.02122223053332336 +5001800US0608,5001800US0625,0.020261799685840378 +5001800US0609,5001800US0609,0.7449549952013407 +5001800US0609,5001800US0610,0.12939921639709387 +5001800US0609,5001800US0608,0.058466694739002095 +5001800US0609,5001800US0607,0.03610857581347429 +5001800US0609,5001800US0613,0.03107051784908911 +5001800US0610,5001800US0605,0.46922839800977745 +5001800US0610,5001800US0613,0.36093406736538647 +5001800US0610,5001800US0609,0.1698375346248361 +5001800US0611,5001800US0610,0.5421216175756663 
+5001800US0611,5001800US0608,0.4577227617261042 +5001800US0611,5001800US0609,0.00015562069822947625 +5001800US0612,5001800US0611,0.8824556656398301 +5001800US0612,5001800US0615,0.11754433436016984 +5001800US0613,5001800US0612,0.968319632028912 +5001800US0613,5001800US0614,0.03168036797108799 +5001800US0613,5001800US0617,0.0 +5001800US0614,5001800US0615,0.8147246523351065 +5001800US0614,5001800US0616,0.10011027095148078 +5001800US0614,5001800US0611,0.08516507671341275 +5001800US0615,5001800US0614,0.879209193527972 +5001800US0615,5001800US0610,0.11949548033093281 +5001800US0615,5001800US0617,0.001295326141095208 +5001800US0615,5001800US0612,0.0 +5001800US0616,5001800US0621,0.576124338019409 +5001800US0616,5001800US0613,0.4206240681965774 +5001800US0616,5001800US0620,0.002105990066861183 +5001800US0616,5001800US0605,0.0011456037171525002 +5001800US0617,5001800US0617,0.8767390155851017 +5001800US0617,5001800US0614,0.11236828974787944 +5001800US0617,5001800US0616,0.009779211059258606 +5001800US0617,5001800US0618,0.0011134836077602653 +5001800US0618,5001800US0616,0.7235221984095743 +5001800US0618,5001800US0619,0.10660128717891398 +5001800US0618,5001800US0615,0.08535457356731285 +5001800US0618,5001800US0617,0.07351142602123051 +5001800US0618,5001800US0618,0.011010514822968352 +5001800US0619,5001800US0618,0.4510363311700647 +5001800US0619,5001800US0619,0.32974361985417794 +5001800US0619,5001800US0616,0.1975073480905326 +5001800US0619,5001800US0617,0.021712700885224725 +5001800US0620,5001800US0618,0.5925036040365209 +5001800US0620,5001800US0619,0.4074963959634791 +5001800US0621,5001800US0622,0.5329178551896014 +5001800US0621,5001800US0621,0.18150259864876814 +5001800US0621,5001800US0613,0.17215597787464262 +5001800US0621,5001800US0620,0.11342356828698776 +5001800US0622,5001800US0620,0.41246978264039647 +5001800US0622,5001800US0621,0.321831053895241 +5001800US0622,5001800US0605,0.1605561021095157 +5001800US0622,5001800US0622,0.10514306135484679 +5001800US0623,5001800US0620,0.4311170291854314 +5001800US0623,5001800US0622,0.3619284258105871 +5001800US0623,5001800US0627,0.16211411824180327 +5001800US0623,5001800US0623,0.03673538167724541 +5001800US0623,5001800US0621,0.00810504508493281 +5001800US0624,5001800US0624,0.909745972391165 +5001800US0624,5001800US0619,0.09025402760883502 +5001800US0624,5001800US0626,0.0 +5001800US0625,5001800US0627,0.7963998249467975 +5001800US0625,5001800US0626,0.16642496739205054 +5001800US0625,5001800US0623,0.03555339122674538 +5001800US0625,5001800US0632,0.001615380654904922 +5001800US0625,5001800US0628,6.435779501613235e-06 +5001800US0626,5001800US0626,0.8517580843638418 +5001800US0626,5001800US0624,0.14824191563615824 +5001800US0627,5001800US0628,0.9333023507342876 +5001800US0627,5001800US0631,0.04593987122505196 +5001800US0627,5001800US0630,0.010313825562334173 +5001800US0627,5001800US0635,0.009475605428151353 +5001800US0627,5001800US0638,0.0009683470501749108 +5001800US0627,5001800US0634,0.0 +5001800US0628,5001800US0630,0.9093157999708373 +5001800US0628,5001800US0628,0.06209304235460142 +5001800US0628,5001800US0634,0.016399816979048125 +5001800US0628,5001800US0632,0.009668936551060667 +5001800US0628,5001800US0629,0.0022592696257288576 +5001800US0628,5001800US0627,0.0002631345187236132 +5001800US0629,5001800US0629,0.9349816769417437 +5001800US0629,5001800US0632,0.043869984424961504 +5001800US0629,5001800US0627,0.019276524536392385 +5001800US0629,5001800US0630,0.001871814096902376 +5001800US0630,5001800US0632,0.7750581173919288 
+5001800US0630,5001800US0629,0.11788517210448435 +5001800US0630,5001800US0627,0.06792398640562922 +5001800US0630,5001800US0630,0.038539583302982565 +5001800US0630,5001800US0626,0.0005931407949751609 +5001800US0631,5001800US0633,0.7205763667291623 +5001800US0631,5001800US0635,0.1382415144629724 +5001800US0631,5001800US0628,0.07333507162161602 +5001800US0631,5001800US0623,0.06784704718624932 +5001800US0632,5001800US0631,0.9862601829590875 +5001800US0632,5001800US0638,0.0120510508192493 +5001800US0632,5001800US0628,0.0016887662216632747 +5001800US0632,5001800US0635,0.0 +5001800US0633,5001800US0636,0.7322890602472397 +5001800US0633,5001800US0632,0.11928339549982792 +5001800US0633,5001800US0630,0.07530486386932944 +5001800US0633,5001800US0644,0.04079176239686718 +5001800US0633,5001800US0626,0.03222115769205726 +5001800US0633,5001800US0643,0.00010976029467848605 +5001800US0634,5001800US0634,0.9419729727306151 +5001800US0634,5001800US0637,0.037979859572979904 +5001800US0634,5001800US0630,0.01977097662239838 +5001800US0634,5001800US0628,0.0002761910740066537 +5001800US0635,5001800US0635,0.817698648971904 +5001800US0635,5001800US0633,0.17846596699720388 +5001800US0635,5001800US0638,0.003835384030892129 +5001800US0635,5001800US0631,0.0 +5001800US0635,5001800US0640,0.0 +5001800US0636,5001800US0625,0.7774587938511587 +5001800US0636,5001800US0641,0.21970444069455705 +5001800US0636,5001800US0648,0.0028367654542842463 +5001800US0637,5001800US0637,0.7674439153156702 +5001800US0637,5001800US0636,0.20994549380477542 +5001800US0637,5001800US0634,0.00766878788603953 +5001800US0637,5001800US0630,0.007550976454280881 +5001800US0637,5001800US0632,0.006917740008578145 +5001800US0637,5001800US0643,0.0004730865306558231 +5001800US0638,5001800US0638,0.6887156260676834 +5001800US0638,5001800US0645,0.153737658674189 +5001800US0638,5001800US0642,0.09233370419364165 +5001800US0638,5001800US0644,0.05784019132887717 +5001800US0638,5001800US0631,0.005413976469351461 +5001800US0638,5001800US0634,0.0011844168586671805 +5001800US0638,5001800US0628,0.0007744264075900797 +5001800US0639,5001800US0645,0.35148980358066995 +5001800US0639,5001800US0638,0.3314724177470801 +5001800US0639,5001800US0640,0.20141183792614742 +5001800US0639,5001800US0646,0.10893543902612272 +5001800US0639,5001800US0635,0.005736648593066638 +5001800US0639,5001800US0631,0.0009538531269132028 +5001800US0640,5001800US0642,0.4998035349610858 +5001800US0640,5001800US0637,0.2331520216516807 +5001800US0640,5001800US0634,0.15496950827355685 +5001800US0640,5001800US0644,0.09446356234285 +5001800US0640,5001800US0638,0.015750681460572197 +5001800US0640,5001800US0643,0.001836023054247257 +5001800US0640,5001800US0628,2.4668256007160842e-05 +5001800US0641,5001800US0639,0.9629794917777905 +5001800US0641,5001800US0641,0.03549452538231834 +5001800US0641,5001800US0625,0.0015259828398910835 +5001800US0642,5001800US0641,0.7325250092756586 +5001800US0642,5001800US0648,0.14663007740216222 +5001800US0642,5001800US0635,0.03994483608336192 +5001800US0642,5001800US0639,0.03684828289475219 +5001800US0642,5001800US0640,0.023671013975325582 +5001800US0642,5001800US0625,0.020380780368739512 +5001800US0643,5001800US0643,0.7911076363450807 +5001800US0643,5001800US0636,0.1290098010271214 +5001800US0643,5001800US0637,0.05355938751781425 +5001800US0643,5001800US0644,0.026323175109983634 +5001800US0644,5001800US0644,0.6706043411251857 +5001800US0644,5001800US0643,0.27499939117093486 +5001800US0644,5001800US0642,0.05439626770387946 +5001800US0645,5001800US0640,0.8173764937841915 
+5001800US0645,5001800US0647,0.1781247975118253 +5001800US0645,5001800US0649,0.003175014579148578 +5001800US0645,5001800US0646,0.0013236941248345381 +5001800US0646,5001800US0646,0.8446515657194924 +5001800US0646,5001800US0645,0.08557874500843862 +5001800US0646,5001800US0640,0.069769689272069 +5001800US0646,5001800US0647,0.0 +5001800US0647,5001800US0642,0.41273816684764353 +5001800US0647,5001800US0645,0.3861931582935575 +5001800US0647,5001800US0644,0.13927145747564623 +5001800US0647,5001800US0646,0.0574975197056457 +5001800US0647,5001800US0647,0.00429969767750705 +5001800US0647,5001800US0636,0.0 +5001800US0648,5001800US0647,0.6061644354137836 +5001800US0648,5001800US0645,0.17512332201809025 +5001800US0648,5001800US0649,0.0871521016978921 +5001800US0648,5001800US0640,0.07501824469389795 +5001800US0648,5001800US0646,0.0565418961763361 +5001800US0649,5001800US0649,0.9467849127978065 +5001800US0649,5001800US0650,0.049865068699184355 +5001800US0649,5001800US0640,0.0033283775566073054 +5001800US0649,5001800US0647,2.1640946401868047e-05 +5001800US0649,5001800US0648,0.0 +5001800US0650,5001800US0648,0.7026490283613446 +5001800US0650,5001800US0650,0.22277524290966386 +5001800US0650,5001800US0651,0.07349453453256302 +5001800US0650,5001800US0649,0.0010811941964285715 +5001800US0650,5001800US0652,0.0 +5001800US0651,5001800US0652,0.8287402679957188 +5001800US0651,5001800US0625,0.1625106624192056 +5001800US0651,5001800US0648,0.008269147531661273 +5001800US0651,5001800US0651,0.0003531501902483461 +5001800US0651,5001800US0650,0.00012677186316607296 +5001800US0652,5001800US0651,0.4962193959357763 +5001800US0652,5001800US0650,0.411860889626181 +5001800US0652,5001800US0648,0.09191773246998805 +5001800US0652,5001800US0652,1.981968054638895e-06 +5001800US0653,5001800US0651,0.596359860966912 +5001800US0653,5001800US0652,0.24744097864849005 +5001800US0653,5001800US0650,0.12459892565341037 +5001800US0653,5001800US0648,0.03160023473118765 +5001800US0801,5001800US0801,0.8051210712426631 +5001800US0801,5001800US0806,0.1736981924575039 +5001800US0801,5001800US0807,0.021180736299833067 +5001800US0802,5001800US0802,0.6336205761200139 +5001800US0802,5001800US0807,0.25830170826047577 +5001800US0802,5001800US0804,0.09109034065927386 +5001800US0802,5001800US0808,0.014904476408337267 +5001800US0802,5001800US0806,0.0020828985518992265 +5001800US0803,5001800US0803,0.9294525872910616 +5001800US0803,5001800US0802,0.06644105519096945 +5001800US0803,5001800US0807,0.004106357517968887 +5001800US0804,5001800US0804,0.5029337358528304 +5001800US0804,5001800US0808,0.291207855380495 +5001800US0804,5001800US0802,0.15484187515332773 +5001800US0804,5001800US0803,0.050532135150296344 +5001800US0804,5001800US0806,0.000484398463050554 +5001800US0804,5001800US0807,0.0 +5001800US0805,5001800US0805,0.8312852988063479 +5001800US0805,5001800US0807,0.16231261698785063 +5001800US0805,5001800US0804,0.006402084205801441 +5001800US0806,5001800US0806,0.8110107505768004 +5001800US0806,5001800US0804,0.14431790290118304 +5001800US0806,5001800US0808,0.04467134652201659 +5001800US0807,5001800US0807,0.5448955821367417 +5001800US0807,5001800US0808,0.4551044178632583 +5001800US0807,5001800US0801,0.0 +5001800US0901,5001800US0901,0.9672058926598738 +5001800US0901,5001800US0902,0.03089007332450013 +5001800US0901,5001800US0905,0.001904034015626095 +5001800US0902,5001800US0902,1.0 +5001800US0903,5001800US0903,0.9781755317954713 +5001800US0903,5001800US0901,0.019992090206190017 +5001800US0903,5001800US0905,0.0018323779983385978 
+5001800US0904,5001800US0904,0.9696486259458347 +5001800US0904,5001800US0903,0.030351374054165325 +5001800US0905,5001800US0905,0.9878146024980679 +5001800US0905,5001800US0901,0.007909204505424981 +5001800US0905,5001800US0903,0.004276192996507192 +5001800US1000,5001800US1000,1.0 +5001800US1198,5001800US1198,1.0 +5001800US1201,5001800US1201,0.9747987495611197 +5001800US1201,5001800US1202,0.025201250438880357 +5001800US1202,5001800US1202,0.6993820113510333 +5001800US1202,5001800US1203,0.3006179886489667 +5001800US1203,5001800US1203,0.5792179712439726 +5001800US1203,5001800US1206,0.2533873174390422 +5001800US1203,5001800US1204,0.16739471131698533 +5001800US1204,5001800US1205,0.7624656298645562 +5001800US1204,5001800US1204,0.2245951961825509 +5001800US1204,5001800US1206,0.01293917395289291 +5001800US1205,5001800US1204,0.39587483354951214 +5001800US1205,5001800US1202,0.36349765080899793 +5001800US1205,5001800US1205,0.15146400892627834 +5001800US1205,5001800US1203,0.08916350671521157 +5001800US1206,5001800US1206,0.6101570207856056 +5001800US1206,5001800US1207,0.38717161920369225 +5001800US1206,5001800US1211,0.00237036169963713 +5001800US1206,5001800US1205,0.00030099831106503237 +5001800US1207,5001800US1207,0.7243416431150246 +5001800US1207,5001800US1210,0.25373695130119417 +5001800US1207,5001800US1209,0.021921405583781223 +5001800US1208,5001800US1208,1.0 +5001800US1209,5001800US1218,0.5605197858168995 +5001800US1209,5001800US1209,0.3590995167820295 +5001800US1209,5001800US1210,0.07936855165208306 +5001800US1209,5001800US1211,0.0010121457489878543 +5001800US1210,5001800US1210,0.5368456034843628 +5001800US1210,5001800US1211,0.2794200823797457 +5001800US1210,5001800US1209,0.18373431413589142 +5001800US1211,5001800US1212,0.5453889102810936 +5001800US1211,5001800US1211,0.2894787507853191 +5001800US1211,5001800US1206,0.13233133372514846 +5001800US1211,5001800US1203,0.032801005208438885 +5001800US1212,5001800US1212,0.5772534157521615 +5001800US1212,5001800US1213,0.23956926250122584 +5001800US1212,5001800US1215,0.17647868298593206 +5001800US1212,5001800US1214,0.006698638760680651 +5001800US1213,5001800US1213,0.7453819831827065 +5001800US1213,5001800US1214,0.2546180168172935 +5001800US1214,5001800US1214,0.6632249170480902 +5001800US1214,5001800US1215,0.3367750829519099 +5001800US1215,5001800US1215,0.5084681430291608 +5001800US1215,5001800US1216,0.19446251879108797 +5001800US1215,5001800US1218,0.17809650953203646 +5001800US1215,5001800US1211,0.060903692402043276 +5001800US1215,5001800US1214,0.05644244504775367 +5001800US1215,5001800US1209,0.0016266911979178353 +5001800US1216,5001800US1216,0.6709346399186465 +5001800US1216,5001800US1217,0.2958556747095621 +5001800US1216,5001800US1214,0.033209685371791316 +5001800US1217,5001800US1217,0.5462582449029376 +5001800US1217,5001800US1218,0.4537417550970623 +5001800US1218,5001800US1221,0.95046338878938 +5001800US1218,5001800US1220,0.04953661121062002 +5001800US1218,5001800US1222,0.0 +5001800US1219,5001800US1219,0.8897051787745729 +5001800US1219,5001800US1217,0.09399619379737818 +5001800US1219,5001800US1226,0.01629862742804895 +5001800US1220,5001800US1220,0.8047201512884832 +5001800US1220,5001800US1222,0.07604962558569756 +5001800US1220,5001800US1224,0.07049154352527683 +5001800US1220,5001800US1221,0.0194666628254647 +5001800US1220,5001800US1223,0.016768904372591105 +5001800US1220,5001800US1225,0.01250311240248663 +5001800US1221,5001800US1222,0.887138729405424 +5001800US1221,5001800US1223,0.10680921618971338 +5001800US1221,5001800US1220,0.006052054404862603 
+5001800US1222,5001800US1223,0.86342693421676 +5001800US1222,5001800US1220,0.09376805885037684 +5001800US1222,5001800US1225,0.0410140895909871 +5001800US1222,5001800US1222,0.0017909173418760554 +5001800US1223,5001800US1225,0.7424736452134326 +5001800US1223,5001800US1220,0.1538560090194546 +5001800US1223,5001800US1224,0.10367034576711273 +5001800US1223,5001800US1227,0.0 +5001800US1224,5001800US1224,0.7304388158425426 +5001800US1224,5001800US1226,0.14046409077464977 +5001800US1224,5001800US1225,0.11044786821184337 +5001800US1224,5001800US1227,0.0186492251709643 +5001800US1225,5001800US1226,0.6776329968288068 +5001800US1225,5001800US1227,0.11529185581505136 +5001800US1225,5001800US1228,0.10399597699773193 +5001800US1225,5001800US1218,0.08066530057916181 +5001800US1225,5001800US1219,0.020929190370793552 +5001800US1225,5001800US1224,0.0014846794084544623 +5001800US1226,5001800US1228,0.8915866534230953 +5001800US1226,5001800US1227,0.10841334657690468 +5001800US1227,5001800US1227,0.7776326837521148 +5001800US1227,5001800US1224,0.20028071934331726 +5001800US1227,5001800US1226,0.022086596904567955 +5001800US1227,5001800US1228,0.0 +5001800US1301,5001800US1301,0.9561881617774692 +5001800US1301,5001800US1308,0.0422120002314413 +5001800US1301,5001800US1312,0.00159983799108951 +5001800US1302,5001800US1302,0.9535883881372637 +5001800US1302,5001800US1308,0.0425193208077129 +5001800US1302,5001800US1303,0.00389229105502341 +5001800US1303,5001800US1303,0.9356156702384453 +5001800US1303,5001800US1302,0.03246398019680113 +5001800US1303,5001800US1310,0.02951333091719719 +5001800US1303,5001800US1306,0.0024070186475563973 +5001800US1304,5001800US1304,0.5292481140839385 +5001800US1304,5001800US1313,0.34044250327086667 +5001800US1304,5001800US1305,0.12428806972660905 +5001800US1304,5001800US1310,0.0060213129185858185 +5001800US1305,5001800US1305,0.8409159109359058 +5001800US1305,5001800US1306,0.09164795107729144 +5001800US1305,5001800US1304,0.054169391087904833 +5001800US1305,5001800US1313,0.013266746898897912 +5001800US1306,5001800US1311,0.35373415889203297 +5001800US1306,5001800US1304,0.2627546017426386 +5001800US1306,5001800US1307,0.2618954230054446 +5001800US1306,5001800US1305,0.07814324003772916 +5001800US1306,5001800US1306,0.04347257632215467 +5001800US1307,5001800US1307,0.5697457009305716 +5001800US1307,5001800US1304,0.2169990136798319 +5001800US1307,5001800US1313,0.11428448904326943 +5001800US1307,5001800US1309,0.09897079634632704 +5001800US1308,5001800US1308,0.8445285294339339 +5001800US1308,5001800US1302,0.15547147056606617 +5001800US1309,5001800US1309,0.5130770554046914 +5001800US1309,5001800US1307,0.28956600272784244 +5001800US1309,5001800US1310,0.19105955789292603 +5001800US1309,5001800US1311,0.006297383974540137 +5001800US1310,5001800US1310,0.6874925073082563 +5001800US1310,5001800US1312,0.2771235441123581 +5001800US1310,5001800US1308,0.03511959104504222 +5001800US1310,5001800US1313,0.00026435753434342503 +5001800US1311,5001800US1311,0.5762284303464563 +5001800US1311,5001800US1306,0.34235702976142557 +5001800US1311,5001800US1314,0.08141453989211819 +5001800US1312,5001800US1312,0.9080261599572386 +5001800US1312,5001800US1308,0.0680724848815151 +5001800US1312,5001800US1301,0.02390135516124637 +5001800US1313,5001800US1306,0.4755343460142492 +5001800US1313,5001800US1305,0.21698908699849687 +5001800US1313,5001800US1313,0.1867369758887436 +5001800US1313,5001800US1303,0.1207395910985103 +5001800US1314,5001800US1314,0.9248507449920454 +5001800US1314,5001800US1311,0.055233761568332276 
+5001800US1314,5001800US1303,0.019915493439622286 +5001800US1501,5001800US1501,0.9875087703991304 +5001800US1501,5001800US1502,0.012491229600869588 +5001800US1502,5001800US1502,1.0 +5001800US1502,5001800US1501,0.0 +5001800US1601,5001800US1601,0.9915164906448278 +5001800US1601,5001800US1602,0.008483509355172204 +5001800US1602,5001800US1602,0.9999813547194701 +5001800US1602,5001800US1601,1.8645280529898873e-05 +5001800US1701,5001800US1701,0.7328664399960247 +5001800US1701,5001800US1706,0.16391961524214507 +5001800US1701,5001800US1707,0.05042568505630125 +5001800US1701,5001800US1702,0.0340563102878155 +5001800US1701,5001800US1704,0.014863591053235477 +5001800US1701,5001800US1714,0.0038683583644779597 +5001800US1702,5001800US1702,0.866871509273929 +5001800US1702,5001800US1701,0.13312849072607102 +5001800US1703,5001800US1706,0.42346040790348827 +5001800US1703,5001800US1704,0.38180285318279644 +5001800US1703,5001800US1701,0.08662255730722768 +5001800US1703,5001800US1714,0.05225556482250935 +5001800US1703,5001800US1707,0.03742106541145526 +5001800US1703,5001800US1711,0.018437551372522995 +5001800US1704,5001800US1704,0.5137089622251435 +5001800US1704,5001800US1703,0.4625259850064018 +5001800US1704,5001800US1707,0.022781258287399162 +5001800US1704,5001800US1705,0.00098379448105554 +5001800US1705,5001800US1705,0.5793785972791489 +5001800US1705,5001800US1703,0.20435952079881398 +5001800US1705,5001800US1704,0.09853002092962414 +5001800US1705,5001800US1707,0.05321520449986919 +5001800US1705,5001800US1706,0.037086040376733236 +5001800US1705,5001800US1709,0.017764291008982296 +5001800US1705,5001800US1708,0.00966632510682829 +5001800US1706,5001800US1706,0.2649640408149341 +5001800US1706,5001800US1708,0.17813077219242998 +5001800US1706,5001800US1703,0.16607018985994842 +5001800US1706,5001800US1709,0.14958111245225883 +5001800US1706,5001800US1705,0.14925295544375114 +5001800US1706,5001800US1711,0.07081855679146921 +5001800US1706,5001800US1704,0.02099717492555547 +5001800US1706,5001800US1710,0.00018519751965286838 +5001800US1707,5001800US1707,0.8356214674918557 +5001800US1707,5001800US1704,0.05429995574161812 +5001800US1707,5001800US1705,0.04485768390808768 +5001800US1707,5001800US1703,0.03863103746000421 +5001800US1707,5001800US1701,0.020530084816472824 +5001800US1707,5001800US1702,0.006059770581961444 +5001800US1708,5001800US1708,0.6857132588595797 +5001800US1708,5001800US1703,0.170406761704302 +5001800US1708,5001800US1706,0.09747842433163478 +5001800US1708,5001800US1709,0.025812671788885817 +5001800US1708,5001800US1705,0.013671231035835221 +5001800US1708,5001800US1710,0.006917652279762546 +5001800US1708,5001800US1704,0.0 +5001800US1709,5001800US1709,0.7062608912123475 +5001800US1709,5001800US1705,0.1787306724199928 +5001800US1709,5001800US1708,0.06752966558791801 +5001800US1709,5001800US1710,0.04747877077974166 +5001800US1710,5001800US1710,0.8249960175825701 +5001800US1710,5001800US1709,0.11204077995448215 +5001800US1710,5001800US1705,0.05610713570343687 +5001800US1710,5001800US1708,0.0068560667595108805 +5001800US1711,5001800US1711,0.5593862773123179 +5001800US1711,5001800US1714,0.32027369568135694 +5001800US1711,5001800US1706,0.08971903945924177 +5001800US1711,5001800US1701,0.030620987547083395 +5001800US1712,5001800US1712,0.5961217321481562 +5001800US1712,5001800US1713,0.3996578683491372 +5001800US1712,5001800US1715,0.004220399502706603 +5001800US1713,5001800US1713,0.5947948448680809 +5001800US1713,5001800US1715,0.33531142019670085 +5001800US1713,5001800US1717,0.059265515333313916 
+5001800US1713,5001800US1716,0.010628219601904309 +5001800US1714,5001800US1711,0.44987551317288366 +5001800US1714,5001800US1714,0.21123631670458598 +5001800US1714,5001800US1710,0.20862351333697274 +5001800US1714,5001800US1716,0.04576508120832689 +5001800US1714,5001800US1703,0.04057103006932764 +5001800US1714,5001800US1709,0.022025806166594615 +5001800US1714,5001800US1708,0.021852250387344944 +5001800US1714,5001800US1701,5.0488953963540664e-05 +5001800US1715,5001800US1712,0.5253890530153864 +5001800US1715,5001800US1715,0.3359293734830502 +5001800US1715,5001800US1702,0.1347890821973043 +5001800US1715,5001800US1713,0.003779529041499511 +5001800US1715,5001800US1716,0.0001129622627596367 +5001800US1716,5001800US1716,0.4820501673237491 +5001800US1716,5001800US1714,0.2313222345126926 +5001800US1716,5001800US1717,0.10048715840624348 +5001800US1716,5001800US1702,0.09952826798445441 +5001800US1716,5001800US1701,0.045711200436146134 +5001800US1716,5001800US1711,0.04090097133671428 +5001800US1717,5001800US1717,0.8136161411561997 +5001800US1717,5001800US1716,0.14275098070808273 +5001800US1717,5001800US1715,0.04363287813571751 +5001800US1718,5001800US1715,0.43658111095457497 +5001800US1718,5001800US1716,0.34141233887645295 +5001800US1718,5001800US1717,0.1454151426131452 +5001800US1718,5001800US1713,0.07659140755582688 +5001800US1801,5001800US1801,0.988531203859671 +5001800US1801,5001800US1802,0.011468796140328988 +5001800US1802,5001800US1802,0.9614751108212379 +5001800US1802,5001800US1801,0.030442216351061046 +5001800US1802,5001800US1803,0.008082672827701072 +5001800US1803,5001800US1803,0.9597306037408526 +5001800US1803,5001800US1802,0.040269396259147425 +5001800US1804,5001800US1804,0.8313791776657714 +5001800US1804,5001800US1805,0.12645125968862403 +5001800US1804,5001800US1802,0.023428810986406735 +5001800US1804,5001800US1808,0.018740751659197804 +5001800US1805,5001800US1805,0.6400148982319304 +5001800US1805,5001800US1807,0.3104134259360691 +5001800US1805,5001800US1804,0.03134544179830423 +5001800US1805,5001800US1803,0.018226234033696297 +5001800US1806,5001800US1806,0.5011702888227366 +5001800US1806,5001800US1809,0.32791859893966063 +5001800US1806,5001800US1805,0.1407964956011474 +5001800US1806,5001800US1803,0.030114616636455326 +5001800US1807,5001800US1807,0.8034097347630396 +5001800US1807,5001800US1806,0.19659026523696044 +5001800US1808,5001800US1808,1.0 +5001800US1809,5001800US1809,0.7601494094636573 +5001800US1809,5001800US1806,0.15824668034071387 +5001800US1809,5001800US1808,0.051898563431577525 +5001800US1809,5001800US1804,0.029705346764051274 +5001800US1901,5001800US1902,0.8720655061774997 +5001800US1901,5001800US1901,0.07558665679428175 +5001800US1901,5001800US1904,0.05234783702821857 +5001800US1902,5001800US1901,0.8463125152936964 +5001800US1902,5001800US1903,0.15368748470630356 +5001800US1903,5001800US1903,0.7723673464179066 +5001800US1903,5001800US1904,0.18090719991832777 +5001800US1903,5001800US1901,0.0467254536637656 +5001800US1904,5001800US1904,0.8253842778468936 +5001800US1904,5001800US1902,0.1635109839452142 +5001800US1904,5001800US1903,0.011104738207892137 +5001800US2001,5001800US2001,0.8843389638576956 +5001800US2001,5001800US2002,0.11097291686326319 +5001800US2001,5001800US2004,0.004688119279041238 +5001800US2002,5001800US2002,0.7700740811652688 +5001800US2002,5001800US2001,0.15106556109486954 +5001800US2002,5001800US2003,0.0788603577398617 +5001800US2003,5001800US2003,0.8321204268744993 +5001800US2003,5001800US2002,0.1678795731255007 +5001800US2004,5001800US2004,1.0 
+5001800US2101,5001800US2101,0.9002669684401693 +5001800US2101,5001800US2102,0.09973303155983068 +5001800US2102,5001800US2102,0.8524362905311212 +5001800US2102,5001800US2101,0.06378432398065823 +5001800US2102,5001800US2106,0.06038959650183638 +5001800US2102,5001800US2104,0.02338978898638415 +5001800US2103,5001800US2103,0.9897572842907433 +5001800US2103,5001800US2102,0.010242715709256708 +5001800US2104,5001800US2104,0.9288639599361822 +5001800US2104,5001800US2105,0.05473830922861576 +5001800US2104,5001800US2102,0.012965229702017368 +5001800US2104,5001800US2103,0.0034325011331846384 +5001800US2105,5001800US2105,0.9925275551511651 +5001800US2105,5001800US2104,0.007472444848834865 +5001800US2106,5001800US2106,0.9323193563898508 +5001800US2106,5001800US2105,0.030919473416900174 +5001800US2106,5001800US2101,0.02025572148334682 +5001800US2106,5001800US2104,0.016505448709902303 +5001800US2201,5001800US2201,0.8325783943568084 +5001800US2201,5001800US2202,0.07545747116388095 +5001800US2201,5001800US2203,0.05912838698044444 +5001800US2201,5001800US2205,0.032835747498866225 +5001800US2202,5001800US2202,0.8091613356906828 +5001800US2202,5001800US2206,0.15022270954110004 +5001800US2202,5001800US2201,0.03794810609405958 +5001800US2202,5001800US2205,0.002667848674157528 +5001800US2203,5001800US2203,0.8182390505997069 +5001800US2203,5001800US2204,0.11466251786457297 +5001800US2203,5001800US2206,0.06709843153572011 +5001800US2204,5001800US2204,0.6867822356979522 +5001800US2204,5001800US2206,0.3132177643020479 +5001800US2205,5001800US2205,0.5912365355773487 +5001800US2205,5001800US2204,0.22153779067846815 +5001800US2205,5001800US2206,0.18722567374418314 +5001800US2206,5001800US2206,0.4288051906586389 +5001800US2206,5001800US2205,0.2877383232986681 +5001800US2206,5001800US2203,0.11320356686735997 +5001800US2206,5001800US2202,0.11290993887976647 +5001800US2206,5001800US2201,0.057342980295566504 +5001800US2301,5001800US2301,0.941166511470414 +5001800US2301,5001800US2302,0.05883348852958605 +5001800US2302,5001800US2302,0.9799833209581671 +5001800US2302,5001800US2301,0.02001667904183294 +5001800US2401,5001800US2401,0.8710774342335513 +5001800US2401,5001800US2402,0.12892256576644867 +5001800US2402,5001800US2402,0.4366817998188828 +5001800US2402,5001800US2407,0.26050547631884446 +5001800US2402,5001800US2401,0.14073831560957367 +5001800US2402,5001800US2403,0.13744035971147392 +5001800US2402,5001800US2405,0.0246340485412252 +5001800US2403,5001800US2403,0.3098054988636244 +5001800US2403,5001800US2407,0.2845048950213751 +5001800US2403,5001800US2402,0.22175483091363193 +5001800US2403,5001800US2408,0.11619308900964953 +5001800US2403,5001800US2404,0.03769359660246525 +5001800US2403,5001800US2405,0.030048089589253785 +5001800US2404,5001800US2404,0.7260962321123802 +5001800US2404,5001800US2405,0.14302711041092295 +5001800US2404,5001800US2403,0.1308471182880399 +5001800US2404,5001800US2408,2.9539188656951555e-05 +5001800US2405,5001800US2405,0.6828384316140046 +5001800US2405,5001800US2404,0.31716156838599535 +5001800US2405,5001800US2403,0.0 +5001800US2406,5001800US2406,0.763907930400518 +5001800US2406,5001800US2408,0.23609206959948204 +5001800US2407,5001800US2407,0.6091312221019334 +5001800US2407,5001800US2403,0.22031060524784374 +5001800US2407,5001800US2402,0.14582256423260606 +5001800US2407,5001800US2401,0.024735608417616784 +5001800US2408,5001800US2408,0.6534096708918524 +5001800US2408,5001800US2406,0.19547588882017042 +5001800US2408,5001800US2402,0.12317537423217105 +5001800US2408,5001800US2403,0.027939066055806077 
+5001800US2408,5001800US2404,0.0 +5001800US2501,5001800US2501,0.9788216495205518 +5001800US2501,5001800US2502,0.021178350479448205 +5001800US2502,5001800US2502,0.8937180011792643 +5001800US2502,5001800US2501,0.07845007297906001 +5001800US2502,5001800US2504,0.027831925841675716 +5001800US2503,5001800US2503,0.9435368111458736 +5001800US2503,5001800US2506,0.03571847060615648 +5001800US2503,5001800US2505,0.01977800822582346 +5001800US2503,5001800US2502,0.0009667100221464477 +5001800US2504,5001800US2504,0.9128553617716668 +5001800US2504,5001800US2505,0.03787473233404711 +5001800US2504,5001800US2508,0.03331384537360532 +5001800US2504,5001800US2502,0.015862729629212216 +5001800US2504,5001800US2509,9.333089146849995e-05 +5001800US2505,5001800US2505,0.9431400489978259 +5001800US2505,5001800US2502,0.04449642731279783 +5001800US2505,5001800US2504,0.007218581718941797 +5001800US2505,5001800US2507,0.0051449419704344595 +5001800US2505,5001800US2508,0.0 +5001800US2506,5001800US2506,0.9606618898017568 +5001800US2506,5001800US2503,0.03818849125728009 +5001800US2506,5001800US2505,0.0011496189409631033 +5001800US2507,5001800US2507,0.9583135106287732 +5001800US2507,5001800US2508,0.03649893879041418 +5001800US2507,5001800US2505,0.005187550580812559 +5001800US2508,5001800US2508,0.944787405218379 +5001800US2508,5001800US2509,0.04152162405656574 +5001800US2508,5001800US2507,0.00965431657324345 +5001800US2508,5001800US2504,0.004036654151811849 +5001800US2508,5001800US2505,0.0 +5001800US2509,5001800US2509,0.9300874102769248 +5001800US2509,5001800US2504,0.06991258972307526 +5001800US2509,5001800US2508,0.0 +5001800US2601,5001800US2601,0.9404095360933812 +5001800US2601,5001800US2602,0.05959046390661885 +5001800US2602,5001800US2603,0.486529033934378 +5001800US2602,5001800US2602,0.3071689764664076 +5001800US2602,5001800US2604,0.2063019895992144 +5001800US2603,5001800US2603,0.5487075481609025 +5001800US2603,5001800US2602,0.23468724570740748 +5001800US2603,5001800US2604,0.13520842866987057 +5001800US2603,5001800US2605,0.08139677746181939 +5001800US2604,5001800US2602,0.48116656595311563 +5001800US2604,5001800US2607,0.23333636365256855 +5001800US2604,5001800US2608,0.19860177505405555 +5001800US2604,5001800US2601,0.08689529534026029 +5001800US2605,5001800US2608,0.9182020676890162 +5001800US2605,5001800US2601,0.05437712451879429 +5001800US2605,5001800US2609,0.027063642313728525 +5001800US2605,5001800US2607,0.00035716547846098627 +5001800US2606,5001800US2604,0.6152097283555882 +5001800US2606,5001800US2605,0.3847902716444118 +5001800US2607,5001800US2605,0.709607595652092 +5001800US2607,5001800US2607,0.17329276792946893 +5001800US2607,5001800US2606,0.10808356724375036 +5001800US2607,5001800US2602,0.009016069174688696 +5001800US2608,5001800US2607,0.6547975192781864 +5001800US2608,5001800US2609,0.23932954941294593 +5001800US2608,5001800US2610,0.10573974277537583 +5001800US2608,5001800US2611,0.00013318853349179353 +5001800US2609,5001800US2610,0.7095396396101716 +5001800US2609,5001800US2611,0.26760158451704424 +5001800US2609,5001800US2612,0.022858775872784145 +5001800US2610,5001800US2609,0.7811212928925769 +5001800US2610,5001800US2610,0.21887870710742313 +5001800US2611,5001800US2611,0.5101384973032931 +5001800US2611,5001800US2606,0.27019631562775975 +5001800US2611,5001800US2612,0.1290129337582104 +5001800US2611,5001800US2607,0.04340239196263452 +5001800US2611,5001800US2609,0.03674516844031591 +5001800US2611,5001800US2610,0.010504692907786342 +5001800US2612,5001800US2606,0.5760270363496696 
+5001800US2612,5001800US2613,0.27983553028363195 +5001800US2612,5001800US2612,0.1441374333666985 +5001800US2613,5001800US2612,0.6246669596506167 +5001800US2613,5001800US2613,0.37533304034938336 +5001800US2614,5001800US2613,0.41190846300540057 +5001800US2614,5001800US2611,0.36712753000753195 +5001800US2614,5001800US2612,0.2209640069870675 +5001800US2701,5001800US2701,0.9416988690444315 +5001800US2701,5001800US2702,0.04513284884373742 +5001800US2701,5001800US2707,0.01316828211183107 +5001800US2702,5001800US2702,0.8406623603360881 +5001800US2702,5001800US2701,0.15933763966391196 +5001800US2703,5001800US2703,0.9515062148633415 +5001800US2703,5001800US2706,0.04849378513665852 +5001800US2703,5001800US2705,0.0 +5001800US2704,5001800US2704,0.9872695337649051 +5001800US2704,5001800US2702,0.010586497371606685 +5001800US2704,5001800US2708,0.0021439688634881477 +5001800US2705,5001800US2705,0.9848856987648165 +5001800US2705,5001800US2703,0.015114301235183586 +5001800US2706,5001800US2706,0.8292238057785354 +5001800US2706,5001800US2708,0.07227300687658564 +5001800US2706,5001800US2707,0.052240016403426355 +5001800US2706,5001800US2703,0.046263170941452623 +5001800US2707,5001800US2707,0.9425849699683243 +5001800US2707,5001800US2708,0.05734855249659078 +5001800US2707,5001800US2706,6.647753508486665e-05 +5001800US2708,5001800US2708,0.870002069399742 +5001800US2708,5001800US2707,0.12999793060025802 +5001800US2801,5001800US2801,0.9572797698899 +5001800US2801,5001800US2803,0.04272023011009995 +5001800US2802,5001800US2802,0.9960763018430357 +5001800US2802,5001800US2803,0.003923698156964258 +5001800US2803,5001800US2803,0.8655147881482269 +5001800US2803,5001800US2802,0.13448521185177312 +5001800US2804,5001800US2804,0.8733773563607631 +5001800US2804,5001800US2803,0.12662264363923692 +5001800US2901,5001800US2901,0.9769804108589307 +5001800US2901,5001800US2902,0.023019589141069274 +5001800US2902,5001800US2902,0.6755620222021533 +5001800US2902,5001800US2903,0.1577787440583039 +5001800US2902,5001800US2901,0.10836278630706166 +5001800US2902,5001800US2908,0.05829644743248115 +5001800US2903,5001800US2903,0.6891206851164442 +5001800US2903,5001800US2902,0.15215873989581244 +5001800US2903,5001800US2908,0.13246115910457637 +5001800US2903,5001800US2904,0.0161691151786814 +5001800US2903,5001800US2906,0.01009030070448557 +5001800US2904,5001800US2904,0.7471718789325862 +5001800US2904,5001800US2903,0.14004405610136914 +5001800US2904,5001800US2906,0.09228840041606065 +5001800US2904,5001800US2907,0.020495664549984076 +5001800US2905,5001800US2905,0.8467785096552989 +5001800US2905,5001800US2904,0.12422438948743988 +5001800US2905,5001800US2906,0.02899710085726119 +5001800US2906,5001800US2906,0.8902455639134994 +5001800US2906,5001800US2905,0.07760777481442337 +5001800US2906,5001800US2904,0.03214666127207722 +5001800US2907,5001800US2907,1.0 +5001800US2908,5001800US2908,0.9008372927864159 +5001800US2908,5001800US2903,0.09916270721358411 +5001800US3000,5001800US3002,0.6106969801944205 +5001800US3000,5001800US3001,0.3893030198055794 +5001800US3101,5001800US3101,0.8532079757052351 +5001800US3101,5001800US3103,0.10288864440171386 +5001800US3101,5001800US3102,0.04390337989305108 +5001800US3102,5001800US3102,0.9112527509181813 +5001800US3102,5001800US3101,0.08874724908181875 +5001800US3103,5001800US3103,1.0 +5001800US3201,5001800US3201,0.5651471999748242 +5001800US3201,5001800US3203,0.24891507875316665 +5001800US3201,5001800US3204,0.1859377212720092 +5001800US3202,5001800US3202,0.9999979321753515 
+5001800US3202,5001800US3204,2.0678246484698098e-06 +5001800US3203,5001800US3203,0.596842989599895 +5001800US3203,5001800US3201,0.39709212860480453 +5001800US3203,5001800US3204,0.006064881795300441 +5001800US3204,5001800US3204,0.7958110422121486 +5001800US3204,5001800US3203,0.13791955636236355 +5001800US3204,5001800US3201,0.05101764187104722 +5001800US3204,5001800US3202,0.015251759554440665 +5001800US3301,5001800US3301,0.9951414538029315 +5001800US3301,5001800US3302,0.00485854619706852 +5001800US3302,5001800US3302,1.0 +5001800US3401,5001800US3401,0.9874253792650199 +5001800US3401,5001800US3402,0.012574620734980009 +5001800US3402,5001800US3402,0.9562124416629942 +5001800US3402,5001800US3401,0.04128904730206906 +5001800US3402,5001800US3403,0.002498511034936744 +5001800US3403,5001800US3403,0.5688511147176902 +5001800US3403,5001800US3404,0.3768787904672418 +5001800US3403,5001800US3402,0.05427009481506791 +5001800US3404,5001800US3404,0.5840949485209217 +5001800US3404,5001800US3403,0.3162222700610063 +5001800US3404,5001800US3406,0.09968278141807203 +5001800US3405,5001800US3405,0.8014430283325896 +5001800US3405,5001800US3407,0.1225707742628345 +5001800US3405,5001800US3409,0.0759861974045759 +5001800US3406,5001800US3406,0.9214673469750344 +5001800US3406,5001800US3404,0.038562113672227685 +5001800US3406,5001800US3403,0.03432553046124326 +5001800US3406,5001800US3412,0.005645008891494691 +5001800US3407,5001800US3407,0.7117057460687819 +5001800US3407,5001800US3412,0.12228593963095023 +5001800US3407,5001800US3410,0.10259780804811121 +5001800US3407,5001800US3411,0.06341050625215662 +5001800US3408,5001800US3408,0.8651699383579543 +5001800US3408,5001800US3411,0.05875516886231227 +5001800US3408,5001800US3409,0.03899128402994481 +5001800US3408,5001800US3410,0.03708360874978863 +5001800US3409,5001800US3409,0.8068333136385843 +5001800US3409,5001800US3405,0.1931666863614157 +5001800US3410,5001800US3410,0.7555074254785711 +5001800US3410,5001800US3411,0.13674598648723596 +5001800US3410,5001800US3407,0.05987169842445893 +5001800US3410,5001800US3408,0.04787488960973397 +5001800US3411,5001800US3411,0.7817920628097502 +5001800US3411,5001800US3407,0.08850637200364114 +5001800US3411,5001800US3410,0.08514337008192577 +5001800US3411,5001800US3409,0.02606800596743198 +5001800US3411,5001800US3405,0.018490189137250937 +5001800US3412,5001800US3412,0.8600999399935214 +5001800US3412,5001800US3403,0.07852883231640938 +5001800US3412,5001800US3407,0.04225411947130575 +5001800US3412,5001800US3406,0.019117108218763442 +5001800US3501,5001800US3501,0.7854562273666081 +5001800US3501,5001800US3502,0.2145437726333918 +5001800US3502,5001800US3502,0.653947440544708 +5001800US3502,5001800US3503,0.28274241481788653 +5001800US3502,5001800US3501,0.06331014463740553 +5001800US3503,5001800US3503,0.8652971597638904 +5001800US3503,5001800US3501,0.13295054080006238 +5001800US3503,5001800US3502,0.0017522994360471634 +5001800US3601,5001800US3601,0.7777308735322014 +5001800US3601,5001800US3602,0.2222691264677986 +5001800US3602,5001800US3602,0.8105130028090965 +5001800US3602,5001800US3603,0.13333430807948812 +5001800US3602,5001800US3604,0.05615268911141531 +5001800US3603,5001800US3603,0.7886431433183732 +5001800US3603,5001800US3601,0.21122219916322235 +5001800US3603,5001800US3614,0.00012990490010788444 +5001800US3603,5001800US3606,4.752618296629918e-06 +5001800US3604,5001800US3604,0.883769170512824 +5001800US3604,5001800US3603,0.11623082948717599 +5001800US3605,5001800US3605,0.8415889123726904 +5001800US3605,5001800US3604,0.1295387765021246 
+5001800US3605,5001800US3603,0.028801614245421273 +5001800US3605,5001800US3606,7.069687976375459e-05 +5001800US3606,5001800US3606,0.7995057816525636 +5001800US3606,5001800US3607,0.08138881766120859 +5001800US3606,5001800US3605,0.06057608947824707 +5001800US3606,5001800US3603,0.05852931120798076 +5001800US3607,5001800US3607,0.551085215662451 +5001800US3607,5001800US3610,0.42588501684431435 +5001800US3607,5001800US3605,0.021521756472328075 +5001800US3607,5001800US3611,0.0008790338911286611 +5001800US3607,5001800US3608,0.000628977129777925 +5001800US3608,5001800US3608,0.7606744331212871 +5001800US3608,5001800US3607,0.12366337258642578 +5001800US3608,5001800US3609,0.050926153220319965 +5001800US3608,5001800US3605,0.0467109462122753 +5001800US3608,5001800US3610,0.01802509485969182 +5001800US3608,5001800US3611,0.0 +5001800US3609,5001800US3609,0.7456206621537796 +5001800US3609,5001800US3608,0.1622446481435032 +5001800US3609,5001800US3610,0.0921346897027172 +5001800US3610,5001800US3612,0.33220618493145404 +5001800US3610,5001800US3610,0.276788397666587 +5001800US3610,5001800US3609,0.20647622276850047 +5001800US3610,5001800US3611,0.1516949609685051 +5001800US3610,5001800US3613,0.03283423366495334 +5001800US3610,5001800US3608,0.0 +5001800US3611,5001800US3611,0.8708431322850461 +5001800US3611,5001800US3608,0.08634092972667742 +5001800US3611,5001800US3609,0.04281593798827655 +5001800US3612,5001800US3612,0.5659346745382134 +5001800US3612,5001800US3607,0.16946860446469458 +5001800US3612,5001800US3610,0.1482147669203262 +5001800US3612,5001800US3614,0.11638195407676577 +5001800US3612,5001800US3613,0.0 +5001800US3613,5001800US3613,0.9412145148753163 +5001800US3613,5001800US3615,0.058413339615789775 +5001800US3613,5001800US3612,0.00037214550889397757 +5001800US3614,5001800US3614,0.5794431989565075 +5001800US3614,5001800US3606,0.20730444707129173 +5001800US3614,5001800US3615,0.12144132393103167 +5001800US3614,5001800US3607,0.09181103004116904 +5001800US3614,5001800US3603,0.0 +5001800US3615,5001800US3615,0.5799387820555455 +5001800US3615,5001800US3614,0.338683622056787 +5001800US3615,5001800US3613,0.08137759588766759 +5001800US3616,5001800US3616,0.7157904756373399 +5001800US3616,5001800US3615,0.28231387998877133 +5001800US3616,5001800US3613,0.0018956443738888298 +5001800US3617,5001800US3617,0.7087298045166474 +5001800US3617,5001800US3616,0.2912701954833527 +5001800US3618,5001800US3618,0.74410854131465 +5001800US3618,5001800US3617,0.25589145868535 +5001800US3619,5001800US3619,0.46916598700882584 +5001800US3619,5001800US3618,0.38997189129358306 +5001800US3619,5001800US3621,0.07942348023652093 +5001800US3619,5001800US3620,0.03867677900439836 +5001800US3619,5001800US3617,0.022761862456671812 +5001800US3620,5001800US3620,0.966826774691358 +5001800US3620,5001800US3619,0.018674768518518518 +5001800US3620,5001800US3621,0.014498456790123456 +5001800US3621,5001800US3621,0.8338834368606118 +5001800US3621,5001800US3624,0.11727619117629041 +5001800US3621,5001800US3620,0.048840371963097734 +5001800US3622,5001800US3619,0.408574246313831 +5001800US3622,5001800US3622,0.31161061676333013 +5001800US3622,5001800US3621,0.23706694854743782 +5001800US3622,5001800US3624,0.024235780587885464 +5001800US3622,5001800US3623,0.01851240778751559 +5001800US3623,5001800US3623,0.7260454178274032 +5001800US3623,5001800US3619,0.1516280509440657 +5001800US3623,5001800US3624,0.1223265312285311 +5001800US3624,5001800US3622,0.7947957839262187 +5001800US3624,5001800US3624,0.20520421607378128 +5001800US3625,5001800US3625,1.0 
+5001800US3626,5001800US3626,1.0 +5001800US3626,5001800US3623,0.0 +5001800US3627,5001800US3624,0.4975097244887349 +5001800US3627,5001800US3623,0.4278562834096933 +5001800US3627,5001800US3625,0.04418620936988708 +5001800US3627,5001800US3626,0.03044778273168475 +5001800US3701,5001800US3701,0.521492970029817 +5001800US3701,5001800US3704,0.34909429470483555 +5001800US3701,5001800US3713,0.06587311876112244 +5001800US3701,5001800US3703,0.06353961650422506 +5001800US3702,5001800US3713,0.5915759434296931 +5001800US3702,5001800US3702,0.16855733629744082 +5001800US3702,5001800US3701,0.13473024782356202 +5001800US3702,5001800US3704,0.10513647244930403 +5001800US3703,5001800US3703,0.7100253896113005 +5001800US3703,5001800US3701,0.28997461038869954 +5001800US3704,5001800US3702,0.7875290965786815 +5001800US3704,5001800US3704,0.21247090342131847 +5001800US3704,5001800US3713,0.0 +5001800US3705,5001800US3710,0.4431789962096458 +5001800US3705,5001800US3705,0.3711018167559796 +5001800US3705,5001800US3706,0.15662821853352504 +5001800US3705,5001800US3711,0.02909096850084956 +5001800US3706,5001800US3709,0.49629613747708534 +5001800US3706,5001800US3705,0.31303087802803714 +5001800US3706,5001800US3713,0.17451213716262332 +5001800US3706,5001800US3704,0.012183617242525273 +5001800US3706,5001800US3706,0.003977230089728884 +5001800US3707,5001800US3707,0.7133882206152413 +5001800US3707,5001800US3701,0.22876632486593035 +5001800US3707,5001800US3703,0.03617171459784961 +5001800US3707,5001800US3713,0.021673739920978714 +5001800US3708,5001800US3706,0.3366908737138983 +5001800US3708,5001800US3709,0.3030188382582718 +5001800US3708,5001800US3708,0.21712640402511768 +5001800US3708,5001800US3707,0.14316388400271224 +5001800US3709,5001800US3708,0.7445891182383714 +5001800US3709,5001800US3707,0.13599570539877687 +5001800US3709,5001800US3712,0.09601381009020768 +5001800US3709,5001800US3714,0.022401397850594192 +5001800US3709,5001800US3709,0.00099996842204983 +5001800US3710,5001800US3714,0.44833967154996746 +5001800US3710,5001800US3710,0.38225843236217394 +5001800US3710,5001800US3711,0.16940189608785863 +5001800US3711,5001800US3711,0.7531361162748712 +5001800US3711,5001800US3705,0.1277896704077199 +5001800US3711,5001800US3714,0.11907421331740889 +5001800US3712,5001800US3712,0.873563091049287 +5001800US3712,5001800US3714,0.12120554112861055 +5001800US3712,5001800US3708,0.0052313678221024736 +5001800US3713,5001800US3706,0.515749821393778 +5001800US3713,5001800US3710,0.26001402204558666 +5001800US3713,5001800US3705,0.1766078306962895 +5001800US3713,5001800US3709,0.047628325864345865 +5001800US3800,5001800US3800,1.0 +5001800US3901,5001800US3901,0.6711940673083381 +5001800US3901,5001800US3908,0.3288059326916619 +5001800US3902,5001800US3902,0.524744787922358 +5001800US3902,5001800US3901,0.36714234363767073 +5001800US3902,5001800US3908,0.10811286843997124 +5001800US3903,5001800US3903,0.7221931735657225 +5001800US3903,5001800US3915,0.2778068264342774 +5001800US3904,5001800US3904,0.5038674881480202 +5001800US3904,5001800US3905,0.3343569861103114 +5001800US3904,5001800US3909,0.11068405596821558 +5001800US3904,5001800US3915,0.051091469773452844 +5001800US3905,5001800US3905,0.5013060509523489 +5001800US3905,5001800US3909,0.45066168756797337 +5001800US3905,5001800US3904,0.04803226147967777 +5001800US3906,5001800US3906,0.6051412365923513 +5001800US3906,5001800US3902,0.2667525411355085 +5001800US3906,5001800US3912,0.12810622227214016 +5001800US3907,5001800US3912,0.2310239411946553 +5001800US3907,5001800US3906,0.2139061587606911 
+5001800US3907,5001800US3913,0.20307251056229175 +5001800US3907,5001800US3904,0.16239652388967127 +5001800US3907,5001800US3905,0.10656751279497131 +5001800US3907,5001800US3907,0.08303335279771923 +5001800US3908,5001800US3908,0.6526831903336879 +5001800US3908,5001800US3915,0.1947346967181333 +5001800US3908,5001800US3910,0.1255066296822856 +5001800US3908,5001800US3905,0.027075483265893176 +5001800US3909,5001800US3909,0.3935681798678452 +5001800US3909,5001800US3911,0.2864348236505621 +5001800US3909,5001800US3907,0.19962456019909036 +5001800US3909,5001800US3905,0.12037243628250235 +5001800US3910,5001800US3910,0.9608606430757323 +5001800US3910,5001800US3915,0.039139356924267704 +5001800US3911,5001800US3911,0.773182304717457 +5001800US3911,5001800US3913,0.17935194944204988 +5001800US3911,5001800US3907,0.04746574584049311 +5001800US3912,5001800US3904,0.4063692930988564 +5001800US3912,5001800US3912,0.29986602855367334 +5001800US3912,5001800US3903,0.24525248775291145 +5001800US3912,5001800US3915,0.04851219059455882 +5001800US3913,5001800US3914,0.3893675468313271 +5001800US3913,5001800US3913,0.3339660267653348 +5001800US3913,5001800US3906,0.27666642640333805 +5001800US3914,5001800US3914,0.6613176044166261 +5001800US3914,5001800US3913,0.20643028543620895 +5001800US3914,5001800US3907,0.06744416290503918 +5001800US3914,5001800US3911,0.06480794724212575 +5001800US3915,5001800US3912,0.3399033912242208 +5001800US3915,5001800US3902,0.2715417322639428 +5001800US3915,5001800US3915,0.2244335907639035 +5001800US3915,5001800US3903,0.16412128574793292 +5001800US3916,5001800US3907,0.7033591519716375 +5001800US3916,5001800US3913,0.2373600205030363 +5001800US3916,5001800US3914,0.03914620515850698 +5001800US3916,5001800US3906,0.020134622366819253 +5001800US4001,5001800US4001,0.8842488103200041 +5001800US4001,5001800US4002,0.11561434516042611 +5001800US4001,5001800US4003,0.00013684451956973 +5001800US4002,5001800US4002,0.9971427986045532 +5001800US4002,5001800US4001,0.002857201395446731 +5001800US4003,5001800US4003,0.8040516338653957 +5001800US4003,5001800US4005,0.15793542830858967 +5001800US4003,5001800US4001,0.03801293782601462 +5001800US4004,5001800US4004,0.9535074377312492 +5001800US4004,5001800US4003,0.03701929818883963 +5001800US4004,5001800US4005,0.009473264079911115 +5001800US4005,5001800US4005,0.6984205169252319 +5001800US4005,5001800US4003,0.25506467818681733 +5001800US4005,5001800US4004,0.04651480488795079 +5001800US4101,5001800US4101,0.6319223670158587 +5001800US4101,5001800US4106,0.36793770924258046 +5001800US4101,5001800US4105,0.00013992374156084934 +5001800US4101,5001800US4103,0.0 +5001800US4102,5001800US4102,0.9444103440623628 +5001800US4102,5001800US4105,0.032057244573711126 +5001800US4102,5001800US4103,0.023532411363926097 +5001800US4103,5001800US4103,0.820860213343091 +5001800US4103,5001800US4105,0.10341333036360599 +5001800US4103,5001800US4101,0.075726456293303 +5001800US4104,5001800US4104,0.7968913676393032 +5001800US4104,5001800US4105,0.12964253978803794 +5001800US4104,5001800US4102,0.07346609257265878 +5001800US4105,5001800US4106,0.47915371509797267 +5001800US4105,5001800US4105,0.41577139823175385 +5001800US4105,5001800US4104,0.07165227852397482 +5001800US4105,5001800US4101,0.025655027336911048 +5001800US4105,5001800US4103,0.007767580809387633 +5001800US4105,5001800US4102,0.0 +5001800US4201,5001800US4201,0.9704663963944842 +5001800US4201,5001800US4204,0.029533603605515817 +5001800US4202,5001800US4202,0.9980442399551832 +5001800US4202,5001800US4203,0.001955760044816739 
+5001800US4203,5001800US4203,0.9651450324415789 +5001800US4203,5001800US4202,0.03485496755842115 +5001800US4204,5001800US4204,0.8132087976394857 +5001800US4204,5001800US4205,0.11333182922229233 +5001800US4204,5001800US4201,0.07345937313822193 +5001800US4205,5001800US4205,0.9352211556887271 +5001800US4205,5001800US4203,0.06179816385903176 +5001800US4205,5001800US4204,0.002980680452241172 +5001800US4206,5001800US4206,0.9999884570322645 +5001800US4206,5001800US4205,1.1542967735481352e-05 +5001800US4207,5001800US4207,0.9560041017339772 +5001800US4207,5001800US4208,0.04399589826602283 +5001800US4208,5001800US4208,1.0 +5001800US4209,5001800US4209,0.8270239211179662 +5001800US4209,5001800US4204,0.09447253130846436 +5001800US4209,5001800US4207,0.05684158093527296 +5001800US4209,5001800US4208,0.014906084412248052 +5001800US4209,5001800US4206,0.006755882226048486 +5001800US4210,5001800US4210,0.9981201533812293 +5001800US4210,5001800US4211,0.0018798466187706694 +5001800US4211,5001800US4211,0.9940133511133313 +5001800US4211,5001800US4210,0.005986648886668627 +5001800US4212,5001800US4209,0.41404264144225494 +5001800US4212,5001800US4215,0.4093864763823965 +5001800US4212,5001800US4213,0.17657088217534858 +5001800US4213,5001800US4213,0.803400098291927 +5001800US4213,5001800US4214,0.16630191669257696 +5001800US4213,5001800US4210,0.030297985015496024 +5001800US4214,5001800US4214,0.8417178002193104 +5001800US4214,5001800US4212,0.15828219978068955 +5001800US4215,5001800US4215,0.6702230151650312 +5001800US4215,5001800US4213,0.146158616881186 +5001800US4215,5001800US4214,0.1102858842020305 +5001800US4215,5001800US4216,0.07333248375175226 +5001800US4216,5001800US4216,1.0 +5001800US4217,5001800US4217,0.9754036675840521 +5001800US4217,5001800US4216,0.02459633241594798 +5001800US4218,5001800US4212,0.9361369308404336 +5001800US4218,5001800US4217,0.06386306915956647 +5001800US4401,5001800US4401,0.9892810219452863 +5001800US4401,5001800US4402,0.010718978054713618 +5001800US4402,5001800US4402,0.9957674643078059 +5001800US4402,5001800US4401,0.004232535692194053 +5001800US4501,5001800US4501,0.8144612879153154 +5001800US4501,5001800US4506,0.18553871208468461 +5001800US4502,5001800US4502,0.9552158458685349 +5001800US4502,5001800US4506,0.044784154131465075 +5001800US4503,5001800US4503,0.9695316285950909 +5001800US4503,5001800US4504,0.030468371404909093 +5001800US4504,5001800US4504,0.9457216266573607 +5001800US4504,5001800US4505,0.04442757407486771 +5001800US4504,5001800US4503,0.00985079926777154 +5001800US4505,5001800US4505,0.9358826486854168 +5001800US4505,5001800US4503,0.04183548733721744 +5001800US4505,5001800US4506,0.022281863977365708 +5001800US4506,5001800US4506,0.8276732845698362 +5001800US4506,5001800US4501,0.12874787012718047 +5001800US4506,5001800US4502,0.03427800841593945 +5001800US4506,5001800US4507,0.008498310222448154 +5001800US4506,5001800US4505,0.0008025266645956301 +5001800US4507,5001800US4507,0.9991042095720993 +5001800US4507,5001800US4506,0.0008957904279006572 +5001800US4600,5001800US4600,1.0 +5001800US4701,5001800US4701,1.0 +5001800US4702,5001800US4702,0.9579557293782648 +5001800US4702,5001800US4701,0.04204427062173518 +5001800US4703,5001800US4703,0.9350354507888885 +5001800US4703,5001800US4702,0.04586915262987514 +5001800US4703,5001800US4706,0.019095396581236365 +5001800US4704,5001800US4704,0.7687311547065842 +5001800US4704,5001800US4705,0.14059506960476245 +5001800US4704,5001800US4703,0.07852941701917332 +5001800US4704,5001800US4706,0.012144358669480102 +5001800US4705,5001800US4705,0.3956991032305144 
+5001800US4705,5001800US4707,0.34082255169480097 +5001800US4705,5001800US4706,0.2634783450746847 +5001800US4706,5001800US4706,0.7452178436822422 +5001800US4706,5001800US4707,0.11167944818754638 +5001800US4706,5001800US4705,0.11013147746620294 +5001800US4706,5001800US4704,0.0329712306640085 +5001800US4707,5001800US4707,0.4736478313461206 +5001800US4707,5001800US4705,0.22107563606054137 +5001800US4707,5001800US4708,0.17320363085617865 +5001800US4707,5001800US4704,0.13207290173715935 +5001800US4708,5001800US4708,0.9002346736875378 +5001800US4708,5001800US4709,0.09958526977470926 +5001800US4708,5001800US4707,0.0001800565377528544 +5001800US4709,5001800US4709,0.9920755629102133 +5001800US4709,5001800US4708,0.007924437089786735 +5001800US4801,5001800US4801,0.7893897761480967 +5001800US4801,5001800US4817,0.18520767492239737 +5001800US4801,5001800US4805,0.025402548929505876 +5001800US4802,5001800US4802,0.359408487885635 +5001800US4802,5001800US4838,0.35927425669478064 +5001800US4802,5001800US4818,0.1757354750665563 +5001800US4802,5001800US4807,0.07756325644869012 +5001800US4802,5001800US4808,0.028018523904337903 +5001800US4803,5001800US4803,0.5176007187488897 +5001800US4803,5001800US4804,0.44218792034881654 +5001800US4803,5001800US4832,0.040211360902293804 +5001800US4804,5001800US4804,0.5595153906279079 +5001800US4804,5001800US4801,0.2735209550749991 +5001800US4804,5001800US4803,0.1612014739271225 +5001800US4804,5001800US4805,0.005762180369970596 +5001800US4805,5001800US4805,0.4807568300439261 +5001800US4805,5001800US4832,0.2446883559945106 +5001800US4805,5001800US4806,0.2341394723353597 +5001800US4805,5001800US4824,0.02817444789215227 +5001800US4805,5001800US4830,0.01224089373405129 +5001800US4806,5001800US4825,0.48740579826057345 +5001800US4806,5001800US4806,0.40433282851606944 +5001800US4806,5001800US4833,0.07654560512282023 +5001800US4806,5001800US4830,0.019458254536260412 +5001800US4806,5001800US4812,0.01225751356427648 +5001800US4807,5001800US4838,0.47138783269961976 +5001800US4807,5001800US4807,0.3573855182674822 +5001800US4807,5001800US4809,0.11178293932881468 +5001800US4807,5001800US4808,0.03407174739626385 +5001800US4807,5001800US4822,0.025371962307819473 +5001800US4808,5001800US4802,0.3594753042668233 +5001800US4808,5001800US4817,0.3366527611834511 +5001800US4808,5001800US4808,0.14210931329863713 +5001800US4808,5001800US4810,0.1356544014904518 +5001800US4808,5001800US4838,0.02610821976063668 +5001800US4809,5001800US4809,0.7790925827668356 +5001800US4809,5001800US4807,0.19671033655130174 +5001800US4809,5001800US4822,0.024197080681862694 +5001800US4810,5001800US4810,0.37407784444107517 +5001800US4810,5001800US4838,0.23814680846455413 +5001800US4810,5001800US4837,0.19529121271849076 +5001800US4810,5001800US4822,0.10530442504710953 +5001800US4810,5001800US4808,0.0803092984470099 +5001800US4810,5001800US4802,0.003942039030518313 +5001800US4810,5001800US4827,0.0029283718512421756 +5001800US4810,5001800US4835,0.0 +5001800US4811,5001800US4811,0.7644499701265127 +5001800US4811,5001800US4825,0.1508330739141616 +5001800US4811,5001800US4819,0.08471695595932566 +5001800US4812,5001800US4812,0.8362937978608432 +5001800US4812,5001800US4824,0.11262287629205561 +5001800US4812,5001800US4833,0.022888814480278543 +5001800US4812,5001800US4825,0.0165105446982504 +5001800US4812,5001800US4826,0.011683966668572254 +5001800US4813,5001800US4813,0.8933847583643123 +5001800US4813,5001800US4826,0.07749256505576207 +5001800US4813,5001800US4819,0.016944237918215612 +5001800US4813,5001800US4825,0.012178438661710037 
+5001800US4814,5001800US4814,0.9115498013075247 +5001800US4814,5001800US4836,0.08567210934495577 +5001800US4814,5001800US4822,0.002778089347519549 +5001800US4815,5001800US4815,0.5363008110572539 +5001800US4815,5001800US4834,0.2530354657580266 +5001800US4815,5001800US4828,0.2106637231847195 +5001800US4816,5001800US4816,0.9752759215156578 +5001800US4816,5001800US4823,0.024724078484342233 +5001800US4817,5001800US4817,0.7427373353057313 +5001800US4817,5001800US4810,0.23411768768958907 +5001800US4817,5001800US4806,0.014758448405868842 +5001800US4817,5001800US4837,0.008386528598810805 +5001800US4818,5001800US4818,0.7611156199003413 +5001800US4818,5001800US4829,0.14345803771674914 +5001800US4818,5001800US4809,0.05516140010995628 +5001800US4818,5001800US4802,0.035727064080076094 +5001800US4818,5001800US4807,0.004537878192877276 +5001800US4819,5001800US4819,0.9708470161442841 +5001800US4819,5001800US4825,0.029152983855715832 +5001800US4820,5001800US4820,0.8164786861998101 +5001800US4820,5001800US4823,0.12017930752683004 +5001800US4820,5001800US4835,0.030393346873535315 +5001800US4820,5001800US4828,0.016710932711605717 +5001800US4820,5001800US4821,0.016237726688218823 +5001800US4821,5001800US4821,0.7765707422880009 +5001800US4821,5001800US4837,0.10765738483121347 +5001800US4821,5001800US4835,0.06671481004074069 +5001800US4821,5001800US4820,0.04238946436613812 +5001800US4821,5001800US4823,0.005448484001387357 +5001800US4821,5001800US4810,0.0012191144725194514 +5001800US4822,5001800US4822,0.6477084119200301 +5001800US4822,5001800US4836,0.12348642021878536 +5001800US4822,5001800US4809,0.09349302150132026 +5001800US4822,5001800US4807,0.08792436816295737 +5001800US4822,5001800US4829,0.03448698604300264 +5001800US4822,5001800US4814,0.012900792153904187 +5001800US4823,5001800US4823,0.7146581769317389 +5001800US4823,5001800US4820,0.1716762170236079 +5001800US4823,5001800US4828,0.05921161818321338 +5001800US4823,5001800US4816,0.04323269023829197 +5001800US4823,5001800US4821,0.011221297623147853 +5001800US4824,5001800US4824,0.5527587741059345 +5001800US4824,5001800US4832,0.1824468964810402 +5001800US4824,5001800US4833,0.12638159994223194 +5001800US4824,5001800US4826,0.0776724131906835 +5001800US4824,5001800US4806,0.060740316280109855 +5001800US4825,5001800US4831,0.3023374758849657 +5001800US4825,5001800US4825,0.2401274171115797 +5001800US4825,5001800US4837,0.19086245831277013 +5001800US4825,5001800US4806,0.09325377241389624 +5001800US4825,5001800US4835,0.09159376074894941 +5001800US4825,5001800US4811,0.04402769677120254 +5001800US4825,5001800US4821,0.032500336488851006 +5001800US4825,5001800US4810,0.005297082267785305 +5001800US4826,5001800US4826,0.6296421569031847 +5001800US4826,5001800US4824,0.1817924328545776 +5001800US4826,5001800US4813,0.14163029938044994 +5001800US4826,5001800US4804,0.04693511086178781 +5001800US4827,5001800US4827,0.8794279948976444 +5001800US4827,5001800US4822,0.11994515564771459 +5001800US4827,5001800US4810,0.0006268494546409745 +5001800US4828,5001800US4828,0.6541677607453619 +5001800US4828,5001800US4835,0.2067333375587407 +5001800US4828,5001800US4815,0.1198016171237652 +5001800US4828,5001800US4821,0.019297284572132232 +5001800US4829,5001800US4829,0.8676507112362517 +5001800US4829,5001800US4809,0.05858777578215073 +5001800US4829,5001800US4836,0.054501533321307284 +5001800US4829,5001800US4818,0.01707176152866775 +5001800US4829,5001800US4802,0.002188218131622497 +5001800US4830,5001800US4830,0.81055350194274 +5001800US4830,5001800US4832,0.16845001703655557 
+5001800US4830,5001800US4833,0.0170451454406455 +5001800US4830,5001800US4805,0.0039513355800589265 +5001800US4831,5001800US4831,0.5523763873013532 +5001800US4831,5001800US4811,0.18186091958261125 +5001800US4831,5001800US4837,0.139949977262392 +5001800US4831,5001800US4817,0.07882268352936839 +5001800US4831,5001800US4810,0.04699003232427516 +5001800US4832,5001800US4832,0.41153177123086754 +5001800US4832,5001800US4805,0.3357794521236251 +5001800US4832,5001800US4824,0.21257067385109382 +5001800US4832,5001800US4830,0.03093020306694007 +5001800US4832,5001800US4803,0.009187899727473496 +5001800US4833,5001800US4833,0.7618571219504003 +5001800US4833,5001800US4806,0.1902009754880439 +5001800US4833,5001800US4825,0.021347497822785538 +5001800US4833,5001800US4812,0.012749000160742005 +5001800US4833,5001800US4830,0.009347027592444646 +5001800US4833,5001800US4824,0.004498376985583602 +5001800US4834,5001800US4834,0.7260866211307312 +5001800US4834,5001800US4827,0.21776602757401928 +5001800US4834,5001800US4815,0.05614735129524958 +5001800US4835,5001800US4835,0.4980094714394129 +5001800US4835,5001800US4828,0.27394251845319606 +5001800US4835,5001800US4821,0.13928400327303098 +5001800US4835,5001800US4837,0.056425867120029505 +5001800US4835,5001800US4827,0.018898362212602864 +5001800US4835,5001800US4820,0.013439777501727668 +5001800US4836,5001800US4836,0.7846067785752631 +5001800US4836,5001800US4814,0.1150296162296036 +5001800US4836,5001800US4802,0.057854278225642736 +5001800US4836,5001800US4808,0.04162963698126486 +5001800US4836,5001800US4829,0.0008796899882256878 +5001800US4901,5001800US4901,0.9091577680566965 +5001800US4901,5001800US4903,0.08594433513534165 +5001800US4901,5001800US4902,0.004897896807961948 +5001800US4902,5001800US4902,0.850299662104018 +5001800US4902,5001800US4901,0.1264724400185614 +5001800US4902,5001800US4904,0.023223540740321163 +5001800US4902,5001800US4903,4.357137099497404e-06 +5001800US4903,5001800US4903,0.8580554067686578 +5001800US4903,5001800US4904,0.1419445932313423 +5001800US4904,5001800US4904,0.6890885107218001 +5001800US4904,5001800US4902,0.16057517251905068 +5001800US4904,5001800US4903,0.11050594686522255 +5001800US4904,5001800US4901,0.03983036989392667 +5001800US5000,5001800US5000,1.0 +5001800US5101,5001800US5107,0.4138336030231669 +5001800US5101,5001800US5101,0.3760757059304695 +5001800US5101,5001800US5110,0.18039576075309813 +5001800US5101,5001800US5105,0.029694930293265413 +5001800US5102,5001800US5102,0.637406523986961 +5001800US5102,5001800US5103,0.19848895092520033 +5001800US5102,5001800US5101,0.16410452508783874 +5001800US5103,5001800US5103,0.8910319597934137 +5001800US5103,5001800US5102,0.10896804020658633 +5001800US5104,5001800US5104,0.826274517976534 +5001800US5104,5001800US5102,0.1736402268167411 +5001800US5104,5001800US5101,8.525520672493071e-05 +5001800US5104,5001800US5103,0.0 +5001800US5105,5001800US5105,0.7519580459694037 +5001800US5105,5001800US5109,0.1122738043233655 +5001800US5105,5001800US5110,0.09247051348238751 +5001800US5105,5001800US5104,0.022080139787935348 +5001800US5105,5001800US5107,0.021217496436907898 +5001800US5106,5001800US5106,0.7818803439226134 +5001800US5106,5001800US5105,0.16303469094355702 +5001800US5106,5001800US5109,0.055084965133829544 +5001800US5107,5001800US5101,0.36917190007713147 +5001800US5107,5001800US5107,0.2482688386662914 +5001800US5107,5001800US5104,0.24557675656745567 +5001800US5107,5001800US5105,0.13698250468912146 +5001800US5108,5001800US5108,0.8917904053884835 +5001800US5108,5001800US5111,0.10820959461151643 
+5001800US5109,5001800US5109,0.8736352440574401 +5001800US5109,5001800US5106,0.12636475594255986 +5001800US5110,5001800US5110,0.6431140976040182 +5001800US5110,5001800US5111,0.2229155152185085 +5001800US5110,5001800US5106,0.11234417068259463 +5001800US5110,5001800US5108,0.021626216494878683 +5001800US5111,5001800US5111,0.7142771189258514 +5001800US5111,5001800US5107,0.19590702534952578 +5001800US5111,5001800US5108,0.08625647196668619 +5001800US5111,5001800US5110,0.003559383757936571 +5001800US5301,5001800US5301,0.6544521645742007 +5001800US5301,5001800US5308,0.2143462379510508 +5001800US5301,5001800US5302,0.13046452131906006 +5001800US5301,5001800US5307,0.0007370761556884516 +5001800US5302,5001800US5302,0.7274945610630916 +5001800US5302,5001800US5301,0.2674738973027913 +5001800US5302,5001800US5308,0.005031541634117044 +5001800US5303,5001800US5303,0.9919913928719961 +5001800US5303,5001800US5304,0.007950744082599498 +5001800US5303,5001800US5310,5.786304540440844e-05 +5001800US5304,5001800US5304,0.9752913669888389 +5001800US5304,5001800US5305,0.024708633011161118 +5001800US5305,5001800US5305,1.0 +5001800US5306,5001800US5306,0.9678031875376516 +5001800US5306,5001800US5310,0.03219681246234835 +5001800US5307,5001800US5307,0.93133559866906 +5001800US5307,5001800US5302,0.06398800652663977 +5001800US5307,5001800US5309,0.00374244060117506 +5001800US5307,5001800US5301,0.0009339542031251035 +5001800US5308,5001800US5308,0.7489307823424434 +5001800US5308,5001800US5309,0.1608098906527457 +5001800US5308,5001800US5310,0.039252860518832026 +5001800US5308,5001800US5304,0.03868542971201218 +5001800US5308,5001800US5306,0.01232103677396672 +5001800US5309,5001800US5309,0.856215398176304 +5001800US5309,5001800US5301,0.05388375598456842 +5001800US5309,5001800US5306,0.04078059500546384 +5001800US5309,5001800US5307,0.03738154126660535 +5001800US5309,5001800US5308,0.01173870956705836 +5001800US5310,5001800US5310,0.9179232212990777 +5001800US5310,5001800US5306,0.04179782992767838 +5001800US5310,5001800US5303,0.029684911599845782 +5001800US5310,5001800US5308,0.010594037173398184 +5001800US5401,5001800US5402,0.9959840131183196 +5001800US5401,5001800US5401,0.004015986881680378 +5001800US5402,5001800US5401,0.6415205133288331 +5001800US5402,5001800US5402,0.3584794866711669 +5001800US5403,5001800US5401,1.0 +5001800US5501,5001800US5501,0.8404202053791578 +5001800US5501,5001800US5505,0.15536829376588288 +5001800US5501,5001800US5502,0.004211500854959236 +5001800US5502,5001800US5502,0.9055727408456167 +5001800US5502,5001800US5501,0.08796755651533779 +5001800US5502,5001800US5503,0.0064597026390455226 +5001800US5503,5001800US5503,0.9898182054912683 +5001800US5503,5001800US5507,0.01018179450873167 +5001800US5504,5001800US5504,0.9340872822066614 +5001800US5504,5001800US5501,0.06485038713631118 +5001800US5504,5001800US5505,0.0010623306570274028 +5001800US5505,5001800US5505,0.8350322664299464 +5001800US5505,5001800US5504,0.14214019024061014 +5001800US5505,5001800US5501,0.02017607729314409 +5001800US5505,5001800US5506,0.002651466036299318 +5001800US5506,5001800US5506,0.9870854663832449 +5001800US5506,5001800US5505,0.010109862174732536 +5001800US5506,5001800US5504,0.0027695503327306997 +5001800US5506,5001800US5508,3.512110929187482e-05 +5001800US5507,5001800US5507,0.9930117910625982 +5001800US5507,5001800US5503,0.006988208937401796 +5001800US5508,5001800US5508,0.976997827441602 +5001800US5508,5001800US5506,0.023002172558398034 +5001800US5600,5001800US5600,1.0 diff --git a/policyengine_us_data/storage/upload_completed_datasets.py 
b/policyengine_us_data/storage/upload_completed_datasets.py index 650b9873..e302d65a 100644 --- a/policyengine_us_data/storage/upload_completed_datasets.py +++ b/policyengine_us_data/storage/upload_completed_datasets.py @@ -15,6 +15,7 @@ def upload_datasets(): Pooled_3_Year_CPS_2023.file_path, CPS_2023.file_path, STORAGE_FOLDER / "small_enhanced_cps_2024.h5", + STORAGE_FOLDER / "policy_data.db", ] # Filter to only existing files From a1de133849fe7369a9a3064cc3258a2ed9328c9b Mon Sep 17 00:00:00 2001 From: Ben Ogorek Date: Fri, 15 Aug 2025 09:23:43 -0400 Subject: [PATCH 17/27] Store policy database in storage folder --- policyengine_us_data/db/create_database_tables.py | 6 +++++- policyengine_us_data/db/create_initial_strata.py | 4 +++- policyengine_us_data/db/etl_age.py | 4 +++- policyengine_us_data/db/etl_irs_soi.py | 4 +++- policyengine_us_data/db/etl_medicaid.py | 4 +++- policyengine_us_data/db/etl_snap.py | 6 ++++-- 6 files changed, 21 insertions(+), 7 deletions(-) diff --git a/policyengine_us_data/db/create_database_tables.py b/policyengine_us_data/db/create_database_tables.py index ec42fb61..cf0213ef 100644 --- a/policyengine_us_data/db/create_database_tables.py +++ b/policyengine_us_data/db/create_database_tables.py @@ -12,6 +12,8 @@ create_engine, ) +from policyengine_us_data.storage import STORAGE_FOLDER + logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", @@ -176,7 +178,9 @@ def calculate_definition_hash(mapper, connection, target: Stratum): ) -def create_database(db_uri="sqlite:///policy_data.db"): +def create_database( + db_uri: str = f"sqlite:///{STORAGE_FOLDER / 'policy_data.db'}", +): """ Creates a SQLite database and all the defined tables. diff --git a/policyengine_us_data/db/create_initial_strata.py b/policyengine_us_data/db/create_initial_strata.py index 068bca30..5653948b 100644 --- a/policyengine_us_data/db/create_initial_strata.py +++ b/policyengine_us_data/db/create_initial_strata.py @@ -3,6 +3,8 @@ import pandas as pd from sqlmodel import Session, create_engine +from policyengine_us_data.storage import STORAGE_FOLDER + from policyengine_us.variables.household.demographic.geographic.ucgid.ucgid_enum import ( UCGID, @@ -32,7 +34,7 @@ def main(): .reset_index(drop=True) ) - DATABASE_URL = "sqlite:///policy_data.db" + DATABASE_URL = f"sqlite:///{STORAGE_FOLDER / 'policy_data.db'}" engine = create_engine(DATABASE_URL) # map the ucgid_str 'code' to auto-generated 'stratum_id' diff --git a/policyengine_us_data/db/etl_age.py b/policyengine_us_data/db/etl_age.py index bc540373..f8b3e0a6 100644 --- a/policyengine_us_data/db/etl_age.py +++ b/policyengine_us_data/db/etl_age.py @@ -2,6 +2,8 @@ import numpy as np from sqlmodel import Session, create_engine +from policyengine_us_data.storage import STORAGE_FOLDER + from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -103,7 +105,7 @@ def load_age_data(df_long, geo, year, stratum_lookup=None): raise ValueError('geo must be one of "National", "State", "District"') # Prepare to load data ----------- - DATABASE_URL = "sqlite:///policy_data.db" + DATABASE_URL = f"sqlite:///{STORAGE_FOLDER / 'policy_data.db'}" engine = create_engine(DATABASE_URL) if stratum_lookup is None: diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py index 5e28e464..ecbec177 100644 --- a/policyengine_us_data/db/etl_irs_soi.py +++ b/policyengine_us_data/db/etl_irs_soi.py @@ -5,6 +5,8 @@ from sqlmodel import Session, create_engine 
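The recurring change in PATCH 17 is that every db script derives its SQLite URI from the package's STORAGE_FOLDER constant rather than hard-coding the relative filename "policy_data.db", presumably so all of them resolve the same file regardless of working directory. A minimal sketch of the pattern, with a standalone Path standing in for the real constant:

    from pathlib import Path

    from sqlalchemy import create_engine

    # Stand-in for policyengine_us_data.storage.STORAGE_FOLDER
    STORAGE_FOLDER = Path("policyengine_us_data/storage")

    # Same URI construction as in these hunks: one canonical sqlite file
    DATABASE_URL = f"sqlite:///{STORAGE_FOLDER / 'policy_data.db'}"
    engine = create_engine(DATABASE_URL)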
+from policyengine_us_data.storage import STORAGE_FOLDER + from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -283,7 +285,7 @@ def transform_soi_data(raw_df): def load_soi_data(long_dfs, year): - DATABASE_URL = "sqlite:///policy_data.db" + DATABASE_URL = f"sqlite:///{STORAGE_FOLDER / 'policy_data.db'}" engine = create_engine(DATABASE_URL) session = Session(engine) diff --git a/policyengine_us_data/db/etl_medicaid.py b/policyengine_us_data/db/etl_medicaid.py index 4ff96278..926a0d88 100644 --- a/policyengine_us_data/db/etl_medicaid.py +++ b/policyengine_us_data/db/etl_medicaid.py @@ -3,6 +3,8 @@ import pandas as pd from sqlmodel import Session, create_engine +from policyengine_us_data.storage import STORAGE_FOLDER + from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -83,7 +85,7 @@ def transform_medicaid_data(state_admin_df, cd_survey_df, year): def load_medicaid_data(long_state, long_cd, year): - DATABASE_URL = "sqlite:///policy_data.db" + DATABASE_URL = f"sqlite:///{STORAGE_FOLDER / 'policy_data.db'}" engine = create_engine(DATABASE_URL) stratum_lookup = {} diff --git a/policyengine_us_data/db/etl_snap.py b/policyengine_us_data/db/etl_snap.py index a60c0074..1fba44a4 100644 --- a/policyengine_us_data/db/etl_snap.py +++ b/policyengine_us_data/db/etl_snap.py @@ -7,6 +7,8 @@ import us from sqlmodel import Session, create_engine +from policyengine_us_data.storage import STORAGE_FOLDER + from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -144,7 +146,7 @@ def transform_survey_snap_data(raw_df): def load_administrative_snap_data(df_states, year): - DATABASE_URL = "sqlite:///policy_data.db" + DATABASE_URL = f"sqlite:///{STORAGE_FOLDER / 'policy_data.db'}" engine = create_engine(DATABASE_URL) stratum_lookup = {} @@ -232,7 +234,7 @@ def load_survey_snap_data(survey_df, year, stratum_lookup=None): if stratum_lookup is None: raise ValueError("stratum_lookup must be provided") - DATABASE_URL = "sqlite:///policy_data.db" + DATABASE_URL = f"sqlite:///{STORAGE_FOLDER / 'policy_data.db'}" engine = create_engine(DATABASE_URL) with Session(engine) as session: From b3767264e3e15413e3a5c018f42c08cdb0fa61d0 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Fri, 15 Aug 2025 09:35:21 -0400 Subject: [PATCH 18/27] adding make database to reusable test. 
Updating changelog_entry --- .github/workflows/reusable_test.yaml | 6 +++++- changelog_entry.yaml | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/reusable_test.yaml b/.github/workflows/reusable_test.yaml index f9c5b49a..dce1daf4 100644 --- a/.github/workflows/reusable_test.yaml +++ b/.github/workflows/reusable_test.yaml @@ -58,6 +58,10 @@ jobs: if: inputs.full_suite run: make download + - name: Create and load calibration targets database + if: inputs.full_suite + run: make database + - name: Build datasets if: inputs.full_suite run: make data @@ -90,4 +94,4 @@ jobs: with: branch: gh-pages folder: docs/_build/html - clean: true \ No newline at end of file + clean: true diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 5bd54961..b3a3fb5d 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,4 +1,4 @@ - bump: minor changes: added: - - load script for eitc targets + - add SQLite database for calibration targets From 9078ed9a578e805e36d08bdb376937d0c1f4409d Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Fri, 15 Aug 2025 09:39:16 -0400 Subject: [PATCH 19/27] removing TODOs --- policyengine_us_data/db/etl_age.py | 1 - policyengine_us_data/utils/census.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/policyengine_us_data/db/etl_age.py b/policyengine_us_data/db/etl_age.py index f8b3e0a6..bb83067c 100644 --- a/policyengine_us_data/db/etl_age.py +++ b/policyengine_us_data/db/etl_age.py @@ -134,7 +134,6 @@ def load_age_data(df_long, geo, year, stratum_lookup=None): ) # Create constraints and link them to the parent's relationship attribute. - # TODO: greater_than_or_equal_to to just greater than! new_stratum.constraints_rel = [ StratumConstraint( constraint_variable="ucgid_str", diff --git a/policyengine_us_data/utils/census.py b/policyengine_us_data/utils/census.py index fb577e60..2f424ccb 100644 --- a/policyengine_us_data/utils/census.py +++ b/policyengine_us_data/utils/census.py @@ -127,8 +127,8 @@ def get_census_docs(year): docs_url = ( f"https://api.census.gov/data/{year}/acs/acs1/subject/variables.json" ) - # TODO: Alternative: incorporate it! - "https://api.census.gov/data/2023/acs/acs1/variables.json" + # NOTE: The URL for detail tables, should we ever need it, is: + # "https://api.census.gov/data/2023/acs/acs1/variables.json" docs_response = requests.get(docs_url) docs_response.raise_for_status() From 9913e3c5f271c59e278d69e9a762c56d8d848b0e Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Fri, 15 Aug 2025 10:00:41 -0400 Subject: [PATCH 20/27] Removed troublesome logging.
Updated Makefile --- Makefile | 4 +--- policyengine_us_data/db/create_database_tables.py | 3 --- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 01999135..21a39686 100644 --- a/Makefile +++ b/Makefile @@ -67,9 +67,6 @@ database: python policyengine_us_data/db/etl_snap.py python policyengine_us_data/db/etl_irs_soi.py -clean-database: - rm *.db - data: python policyengine_us_data/utils/uprating.py python policyengine_us_data/datasets/acs/acs.py @@ -84,6 +81,7 @@ data: clean: rm -f policyengine_us_data/storage/*.h5 + rm -f policyengine_us_data/storage/*.db git clean -fX -- '*.csv' rm -rf policyengine_us_data/docs/_build diff --git a/policyengine_us_data/db/create_database_tables.py b/policyengine_us_data/db/create_database_tables.py index cf0213ef..d6cfc8ec 100644 --- a/policyengine_us_data/db/create_database_tables.py +++ b/policyengine_us_data/db/create_database_tables.py @@ -173,9 +173,6 @@ def calculate_definition_hash(mapper, connection, target: Stratum): fingerprint_text = "\n".join(constraint_strings) h = hashlib.sha256(fingerprint_text.encode("utf-8")) target.definition_hash = h.hexdigest() - logger.info( - f"Set definition_hash for Stratum to '{target.definition_hash}'" - ) def create_database( From fddc3acafa14f0d95170b7752eb6fc624658a757 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Fri, 15 Aug 2025 16:14:56 -0400 Subject: [PATCH 21/27] updated comments based on feedback. Removed old make target --- Makefile | 6 ------ policyengine_us_data/db/etl_irs_soi.py | 18 +++++++++--------- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/Makefile b/Makefile index 21a39686..795a77b6 100644 --- a/Makefile +++ b/Makefile @@ -22,12 +22,6 @@ changelog: download: python policyengine_us_data/storage/download_private_prerequisites.py -targets: - python policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py - python policyengine_us_data/storage/calibration_targets/pull_age_targets.py - python policyengine_us_data/storage/calibration_targets/pull_soi_targets.py - python policyengine_us_data/storage/calibration_targets/pull_snap_targets.py - upload: python policyengine_us_data/storage/upload_completed_datasets.py diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py index ecbec177..74abab9e 100644 --- a/policyengine_us_data/db/etl_irs_soi.py +++ b/policyengine_us_data/db/etl_irs_soi.py @@ -27,8 +27,8 @@ """See the 22incddocguide.docx manual from the IRS SOI""" # Let's make this work with strict inequalities -# Interpret Language: '$10,000 under $25,000' -epsilon = 0.005 # Half a penny +# Language in the doc: '$10,000 under $25,000' +epsilon = 0.005 # i.e., half a penny AGI_STUB_TO_INCOME_RANGE = { 1: (-np.inf, 1), 2: (1 - epsilon, 10_000), @@ -191,7 +191,7 @@ def transform_soi_data(raw_df): # State ------------------- # You've got agi_stub == 0 in here, which you want to use any time you don't want to - # break things up by AGI + # divide data by AGI classes (i.e., agi_stub) state_df = raw_df.copy().loc[ (raw_df.STATE != "US") & (raw_df.CONG_DISTRICT == 0) ] @@ -200,7 +200,6 @@ def transform_soi_data(raw_df): ).str.zfill(2) # District ------------------ - # This is going to fail because we're missing the single cong district states district_df = raw_df.copy().loc[(raw_df.CONG_DISTRICT > 0)] max_cong_district_by_state = raw_df.groupby("STATE")[ @@ -284,6 +283,7 @@ def transform_soi_data(raw_df): def load_soi_data(long_dfs, year): + """Load a list of databases into the db, 
critically dependent on order""" DATABASE_URL = f"sqlite:///{STORAGE_FOLDER / 'policy_data.db'}" engine = create_engine(DATABASE_URL) @@ -291,7 +291,6 @@ def load_soi_data(long_dfs, year): session = Session(engine) # Load EITC data -------------------------------------------------------- - # Obviously this is not especially robust --- eitc_data = { "0": (long_dfs[0], long_dfs[1]), "1": (long_dfs[2], long_dfs[3]), @@ -377,7 +376,7 @@ def load_soi_data(long_dfs, year): session.commit() - # No breakdown variables in this set + # There are no breakdown variables used in the following set for j in range(8, 42, 2): count_j, amount_j = long_dfs[j], long_dfs[j + 1] amount_variable_name = amount_j.iloc[0][["target_variable"]].values[0] @@ -446,8 +445,7 @@ def load_soi_data(long_dfs, year): agi_stub = agi_df.iloc[0][["breakdown_value"]].values[0] agi_income_lower, agi_income_upper = AGI_STUB_TO_INCOME_RANGE[agi_stub] - # Make a National Stratum for each AGI Stub, even though there's no national target - # There no national target because the data set only has agi_stub = 0 for national + # Make a National Stratum for each AGI Stub even w/o associated national target note = f"Geo: 0100000US, AGI > {agi_income_lower}, AGI < {agi_income_upper}" nat_stratum = Stratum( parent_stratum_id=None, stratum_group_id=0, notes=note @@ -540,7 +538,9 @@ def load_soi_data(long_dfs, year): def main(): - year = 2022 # NOTE: predates the finalization of the 2020 Census redistricting + # NOTE: predates the finalization of the 2020 Census redistricting + # and there is district mapping in the Transform step + year = 2022 # Extract ----------------------- raw_df = extract_soi_data() From 0cf920a157aa30675a5c51ba64ae404380ac1e8c Mon Sep 17 00:00:00 2001 From: Ben Ogorek Date: Mon, 18 Aug 2025 12:52:48 -0400 Subject: [PATCH 22/27] test: move database tests into package --- policyengine_us_data/tests/test_database.py | 110 ++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 policyengine_us_data/tests/test_database.py diff --git a/policyengine_us_data/tests/test_database.py b/policyengine_us_data/tests/test_database.py new file mode 100644 index 00000000..13c102e5 --- /dev/null +++ b/policyengine_us_data/tests/test_database.py @@ -0,0 +1,110 @@ +import hashlib +from enum import Enum + +import pytest +from sqlalchemy.exc import IntegrityError +from sqlmodel import Session, select + +from policyengine_us_data.db.create_database_tables import ( + Stratum, + StratumConstraint, + Target, + create_database, +) +from policyengine_us_data.db import create_initial_strata + + +@pytest.fixture +def engine(tmp_path): + db_uri = f"sqlite:///{tmp_path/'test.db'}" + return create_database(db_uri) + + +def test_stratum_hash_and_relationships(engine): + with Session(engine) as session: + stratum = Stratum(notes="test", stratum_group_id=0) + stratum.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", operation="in", value="0001" + ), + StratumConstraint( + constraint_variable="age", operation="greater_than", value="20" + ), + StratumConstraint( + constraint_variable="age", operation="less_than", value="65" + ), + ] + stratum.targets_rel = [ + Target(variable="person_count", period=2023, value=100.0) + ] + session.add(stratum) + session.commit() + expected_hash = hashlib.sha256( + "\n".join( + sorted( + [ + "ucgid_str|in|0001", + "age|greater_than|20", + "age|less_than|65", + ] + ) + ).encode("utf-8") + ).hexdigest() + assert stratum.definition_hash == expected_hash + retrieved = session.get(Stratum, 
stratum.stratum_id) + assert len(retrieved.constraints_rel) == 3 + assert retrieved.targets_rel[0].value == 100.0 + + +def test_unique_definition_hash(engine): + with Session(engine) as session: + s1 = Stratum(stratum_group_id=0) + s1.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", operation="in", value="0001" + ) + ] + session.add(s1) + session.commit() + s2 = Stratum(stratum_group_id=0) + s2.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", operation="in", value="0001" + ) + ] + session.add(s2) + with pytest.raises(IntegrityError): + session.commit() + + +def test_create_initial_strata(monkeypatch, engine, tmp_path): + # ``monkeypatch`` is a pytest fixture that lets us temporarily modify or replace + # objects during a test. Here we use it to point ``STORAGE_FOLDER`` to a + # temporary directory so the test doesn't touch real data on disk. + monkeypatch.setattr(create_initial_strata, "STORAGE_FOLDER", tmp_path) + + class FakeEnum(Enum): + NAT = "NAT" + STATE = "STATE" + DIST = "DIST" + + def get_hierarchical_codes(self): + mapping = { + FakeEnum.NAT: ["NAT"], + FakeEnum.STATE: ["STATE", "NAT"], + FakeEnum.DIST: ["DIST", "STATE", "NAT"], + } + return mapping[self] + + # Replace the real ``UCGID`` enumeration with our simplified version so the + # test can run without downloading geographic data. + monkeypatch.setattr(create_initial_strata, "UCGID", FakeEnum) + create_initial_strata.main() + with Session(engine) as session: + strata = session.exec(select(Stratum).order_by(Stratum.stratum_id)).all() + assert len(strata) == 3 + nat, state, dist = strata + assert state.parent_stratum_id == nat.stratum_id + assert dist.parent_stratum_id == state.stratum_id + codes = [s.constraints_rel[0].value for s in strata] + assert codes == ["NAT", "STATE", "DIST"] From 0571ff547e1fbf73a4f41fda0d5ad51636a7b509 Mon Sep 17 00:00:00 2001 From: Ben Ogorek Date: Mon, 18 Aug 2025 14:31:24 -0400 Subject: [PATCH 23/27] Add Great Expectations validation for database --- Makefile | 13 ++++---- .../checkpoints/policy_data_checkpoint.yml | 13 ++++++++ .../expectations/policy_data_suite.json | 12 +++++++ great_expectations/great_expectations.yml | 31 +++++++++++++++++++ policyengine_us_data/db/validate_database.py | 18 +++++++++++ pyproject.toml | 1 + 6 files changed, 82 insertions(+), 6 deletions(-) create mode 100644 great_expectations/checkpoints/policy_data_checkpoint.yml create mode 100644 great_expectations/expectations/policy_data_suite.json create mode 100644 great_expectations/great_expectations.yml create mode 100644 policyengine_us_data/db/validate_database.py diff --git a/Makefile b/Makefile index 795a77b6..79675070 100644 --- a/Makefile +++ b/Makefile @@ -54,12 +54,13 @@ documentation-dev: myst start database: - python policyengine_us_data/db/create_database_tables.py - python policyengine_us_data/db/create_initial_strata.py - python policyengine_us_data/db/etl_age.py - python policyengine_us_data/db/etl_medicaid.py - python policyengine_us_data/db/etl_snap.py - python policyengine_us_data/db/etl_irs_soi.py + python policyengine_us_data/db/create_database_tables.py + python policyengine_us_data/db/create_initial_strata.py + python policyengine_us_data/db/etl_age.py + python policyengine_us_data/db/etl_medicaid.py + python policyengine_us_data/db/etl_snap.py + python policyengine_us_data/db/etl_irs_soi.py + python policyengine_us_data/db/validate_database.py data: python policyengine_us_data/utils/uprating.py diff --git 
a/great_expectations/checkpoints/policy_data_checkpoint.yml b/great_expectations/checkpoints/policy_data_checkpoint.yml new file mode 100644 index 00000000..45111b50 --- /dev/null +++ b/great_expectations/checkpoints/policy_data_checkpoint.yml @@ -0,0 +1,13 @@ +name: policy_data_checkpoint +config_version: 1.0 +class_name: SimpleCheckpoint +validations: + - batch_request: + datasource_name: policy_db + data_connector_name: default_runtime_data_connector_name + data_asset_name: strata + runtime_parameters: + query: SELECT * FROM strata + batch_identifiers: + default_identifier_name: default + expectation_suite_name: policy_data_suite diff --git a/great_expectations/expectations/policy_data_suite.json b/great_expectations/expectations/policy_data_suite.json new file mode 100644 index 00000000..7da34546 --- /dev/null +++ b/great_expectations/expectations/policy_data_suite.json @@ -0,0 +1,12 @@ +{ + "expectation_suite_name": "policy_data_suite", + "expectations": [ + { + "expectation_type": "expect_table_row_count_to_be_greater_than", + "kwargs": {"value": 0} + } + ], + "meta": { + "great_expectations_version": "0.18" + } +} diff --git a/great_expectations/great_expectations.yml b/great_expectations/great_expectations.yml new file mode 100644 index 00000000..f4ba7441 --- /dev/null +++ b/great_expectations/great_expectations.yml @@ -0,0 +1,31 @@ +config_version: 3.0 +datasources: + policy_db: + class_name: Datasource + execution_engine: + class_name: SqlAlchemyExecutionEngine + connection_string: sqlite:///policyengine_us_data/storage/policy_data.db + data_connectors: + default_runtime_data_connector_name: + class_name: RuntimeDataConnector + batch_identifiers: + - default_identifier_name +stores: + expectations_store: + class_name: ExpectationsStore + store_backend: + class_name: InlineStoreBackend + validations_store: + class_name: ValidationsStore + store_backend: + class_name: InlineStoreBackend + checkpoint_store: + class_name: CheckpointStore + store_backend: + class_name: InlineStoreBackend +expectations_store_name: expectations_store +validations_store_name: validations_store +checkpoint_store_name: checkpoint_store +data_docs_sites: {} +anonymous_usage_statistics: + enabled: false diff --git a/policyengine_us_data/db/validate_database.py b/policyengine_us_data/db/validate_database.py new file mode 100644 index 00000000..4ea1203d --- /dev/null +++ b/policyengine_us_data/db/validate_database.py @@ -0,0 +1,18 @@ +from __future__ import annotations + +import great_expectations as ge + + +def main() -> None: + """Run Great Expectations validation on the policy data database.""" + # Ensure we load the DataContext from the repository root + context = ge.get_context() + # Execute the checkpoint configured for the policy database + result = context.run_checkpoint(checkpoint_name="policy_data_checkpoint") + if not result["success"]: + raise ValueError("Great Expectations validation failed") + print("Great Expectations validation succeeded") + + +if __name__ == "__main__": # pragma: no cover - script entry point + main() diff --git a/pyproject.toml b/pyproject.toml index 3e87a403..16ca41f0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,7 @@ dependencies = [ "sqlalchemy>=2.0.41", "sqlmodel>=0.0.24", "xlrd>=2.0.2", + "great_expectations>=0.18.0", ] [project.optional-dependencies] From bdef5011c93ae91c44b21a8414fb7a8e3769e955 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Mon, 18 Aug 2025 18:49:24 -0400 Subject: [PATCH 24/27] working pre lint --- Makefile | 14 ++-- 
.../checkpoints/policy_data_checkpoint.yml | 13 ---- .../expectations/policy_data_suite.json | 12 --- great_expectations/great_expectations.yml | 31 -------- .../db/create_database_tables.py | 12 ++- policyengine_us_data/db/etl_irs_soi.py | 76 +++++++------------ policyengine_us_data/db/validate_database.py | 31 ++++---- policyengine_us_data/tests/test_database.py | 42 +--------- pyproject.toml | 1 - 9 files changed, 65 insertions(+), 167 deletions(-) delete mode 100644 great_expectations/checkpoints/policy_data_checkpoint.yml delete mode 100644 great_expectations/expectations/policy_data_suite.json delete mode 100644 great_expectations/great_expectations.yml diff --git a/Makefile b/Makefile index 79675070..b03e23d5 100644 --- a/Makefile +++ b/Makefile @@ -54,13 +54,13 @@ documentation-dev: myst start database: - python policyengine_us_data/db/create_database_tables.py - python policyengine_us_data/db/create_initial_strata.py - python policyengine_us_data/db/etl_age.py - python policyengine_us_data/db/etl_medicaid.py - python policyengine_us_data/db/etl_snap.py - python policyengine_us_data/db/etl_irs_soi.py - python policyengine_us_data/db/validate_database.py + python policyengine_us_data/db/create_database_tables.py + python policyengine_us_data/db/create_initial_strata.py + python policyengine_us_data/db/etl_age.py + python policyengine_us_data/db/etl_medicaid.py + python policyengine_us_data/db/etl_snap.py + python policyengine_us_data/db/etl_irs_soi.py + python policyengine_us_data/db/validate_database.py data: python policyengine_us_data/utils/uprating.py diff --git a/great_expectations/checkpoints/policy_data_checkpoint.yml b/great_expectations/checkpoints/policy_data_checkpoint.yml deleted file mode 100644 index 45111b50..00000000 --- a/great_expectations/checkpoints/policy_data_checkpoint.yml +++ /dev/null @@ -1,13 +0,0 @@ -name: policy_data_checkpoint -config_version: 1.0 -class_name: SimpleCheckpoint -validations: - - batch_request: - datasource_name: policy_db - data_connector_name: default_runtime_data_connector_name - data_asset_name: strata - runtime_parameters: - query: SELECT * FROM strata - batch_identifiers: - default_identifier_name: default - expectation_suite_name: policy_data_suite diff --git a/great_expectations/expectations/policy_data_suite.json b/great_expectations/expectations/policy_data_suite.json deleted file mode 100644 index 7da34546..00000000 --- a/great_expectations/expectations/policy_data_suite.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "expectation_suite_name": "policy_data_suite", - "expectations": [ - { - "expectation_type": "expect_table_row_count_to_be_greater_than", - "kwargs": {"value": 0} - } - ], - "meta": { - "great_expectations_version": "0.18" - } -} diff --git a/great_expectations/great_expectations.yml b/great_expectations/great_expectations.yml deleted file mode 100644 index f4ba7441..00000000 --- a/great_expectations/great_expectations.yml +++ /dev/null @@ -1,31 +0,0 @@ -config_version: 3.0 -datasources: - policy_db: - class_name: Datasource - execution_engine: - class_name: SqlAlchemyExecutionEngine - connection_string: sqlite:///policyengine_us_data/storage/policy_data.db - data_connectors: - default_runtime_data_connector_name: - class_name: RuntimeDataConnector - batch_identifiers: - - default_identifier_name -stores: - expectations_store: - class_name: ExpectationsStore - store_backend: - class_name: InlineStoreBackend - validations_store: - class_name: ValidationsStore - store_backend: - class_name: InlineStoreBackend - 
checkpoint_store: - class_name: CheckpointStore - store_backend: - class_name: InlineStoreBackend -expectations_store_name: expectations_store -validations_store_name: validations_store -checkpoint_store_name: checkpoint_store -data_docs_sites: {} -anonymous_usage_statistics: - enabled: false diff --git a/policyengine_us_data/db/create_database_tables.py b/policyengine_us_data/db/create_database_tables.py index d6cfc8ec..4c451467 100644 --- a/policyengine_us_data/db/create_database_tables.py +++ b/policyengine_us_data/db/create_database_tables.py @@ -1,19 +1,21 @@ import logging import hashlib from typing import List, Optional +from enum import Enum from sqlalchemy import event, UniqueConstraint from sqlalchemy.orm.attributes import get_history - from sqlmodel import ( Field, Relationship, SQLModel, create_engine, ) +from policyengine_us.system import system from policyengine_us_data.storage import STORAGE_FOLDER + logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", @@ -22,6 +24,10 @@ logger = logging.getLogger(__name__) +# An Enum type to ensure the variable exists in policyengine-us +USVariable = Enum("USVariable", {name: name for name in system.variables.keys()}, type=str) + + class Stratum(SQLModel, table=True): """Represents a unique population subgroup (stratum).""" @@ -81,7 +87,7 @@ class StratumConstraint(SQLModel, table=True): __tablename__ = "stratum_constraints" stratum_id: int = Field(foreign_key="strata.stratum_id", primary_key=True) - constraint_variable: str = Field( + constraint_variable: USVariable = Field( primary_key=True, description="The variable the constraint applies to (e.g., 'age').", ) @@ -114,7 +120,7 @@ class Target(SQLModel, table=True): ) target_id: Optional[int] = Field(default=None, primary_key=True) - variable: str = Field( + variable: USVariable = Field( description="A variable defined in policyengine-us (e.g., 'income_tax')." 
) period: int = Field( diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py index 74abab9e..bda932fd 100644 --- a/policyengine_us_data/db/etl_irs_soi.py +++ b/policyengine_us_data/db/etl_irs_soi.py @@ -80,9 +80,9 @@ def make_records( def make_agi_long(df: pd.DataFrame) -> pd.DataFrame: """Convert IRS SOI AGI‑split table from wide to the long format used""" target_col_map = { - "N1": "agi_tax_unit_count", - "N2": "agi_person_count", - "A00100": "agi_total_amount", + "N1": "tax_unit_count", + "N2": "person_count", + "A00100": "adjusted_gross_income", } work = df[["ucgid_str", "agi_stub"] + list(target_col_map)].rename( columns=target_col_map @@ -156,32 +156,27 @@ def extract_soi_data() -> pd.DataFrame: def transform_soi_data(raw_df): TARGETS = [ - dict(code="59661", name="eitc", breakdown=("eitc_children", 0)), - dict(code="59662", name="eitc", breakdown=("eitc_children", 1)), - dict(code="59663", name="eitc", breakdown=("eitc_children", 2)), - dict(code="59664", name="eitc", breakdown=("eitc_children", "3+")), - dict(code="59664", name="qbid", breakdown=None), + dict(code="59661", name="eitc", breakdown=("eitc_child_count", 0)), + dict(code="59662", name="eitc", breakdown=("eitc_child_count", 1)), + dict(code="59663", name="eitc", breakdown=("eitc_child_count", 2)), + dict(code="59664", name="eitc", breakdown=("eitc_child_count", "3+")), + dict(code="59664", name="qualified_business_income_deduction", breakdown=None), dict(code="18500", name="real_estate_taxes", breakdown=None), dict(code="01000", name="net_capital_gain", breakdown=None), - dict(code="03150", name="ira_payments", breakdown=None), - dict(code="00300", name="taxable_interest", breakdown=None), - dict(code="00400", name="tax_exempt_interest", breakdown=None), - dict(code="00600", name="oridinary_dividends", breakdown=None), - dict(code="00650", name="qualified_dividends", breakdown=None), + dict(code="03150", name="retirement_distributions", breakdown=None), + dict(code="00300", name="taxable_interest_income", breakdown=None), + dict(code="00400", name="tax_exempt_interest_income", breakdown=None), + dict(code="00600", name="non_qualified_dividend_income", breakdown=None), + dict(code="00650", name="qualified_dividend_income", breakdown=None), dict( code="26270", - name="partnership_and_s_crop_net_income", + name="partnership_s_corp_income", breakdown=None, ), - dict(code="02500", name="total_social_security", breakdown=None), - dict(code="01700", name="pension_and_annuities", breakdown=None), + dict(code="02500", name="social_security", breakdown=None), dict(code="02300", name="unemployment_compensation", breakdown=None), - dict(code="00900", name="business_net_income", breakdown=None), - dict( - code="17000", name="medical_and_dental_deduction", breakdown=None - ), - dict(code="00700", name="salt_refunds", breakdown=None), - dict(code="18425", name="salt_amount", breakdown=None), + dict(code="00700", name="salt_refund_income", breakdown=None), + dict(code="18425", name="reported_salt", breakdown=None), dict(code="06500", name="income_tax", breakdown=None), ] @@ -257,7 +252,7 @@ def transform_soi_data(raw_df): temp_df = df[["ucgid_str"]].copy() temp_df["breakdown_variable"] = "one" temp_df["breakdown_value"] = 1 - temp_df["target_variable"] = "agi" + temp_df["target_variable"] = "adjusted_gross_income" temp_df["target_value"] = df["A00100"] * 1_000 records.append(temp_df) @@ -349,14 +344,6 @@ def load_soi_data(long_dfs, year): ) new_stratum.targets_rel = [ - # It's already complex 
enough - # Target( - # variable="tax_unit_count", - # period=year, - # value=eitc_count_i.iloc[i][["target_value"]].values[0], - # source_id=5, - # active=True, - # ), Target( variable="eitc", period=year, @@ -377,7 +364,7 @@ def load_soi_data(long_dfs, year): session.commit() # There are no breakdown variables used in the following set - for j in range(8, 42, 2): + for j in range(8, 36, 2): count_j, amount_j = long_dfs[j], long_dfs[j + 1] amount_variable_name = amount_j.iloc[0][["target_variable"]].values[0] print( @@ -391,16 +378,6 @@ def load_soi_data(long_dfs, year): amount_value = amount_j.iloc[i][["target_value"]].values[0] stratum.targets_rel.append( - # NOTE: If I do the counts, I'm going to need to explode the strata for the vars != 0 - # OR, create new variables like qbid_tax_unit_count which requires adding stuff to -us - # AND, it's already complex enough ----- - # Target( - # variable="tax_unit_count", - # period=year, - # value=count_j.iloc[i][["target_value"]].values[0], - # source_id=5, - # active=True, - # ), Target( variable=amount_variable_name, period=year, @@ -416,14 +393,15 @@ def load_soi_data(long_dfs, year): session.commit() # Adjusted Gross Income ------ - agi_values = long_dfs[42] + agi_values = long_dfs[36] + assert agi_values[['target_variable']].values[0] == 'adjusted_gross_income' for i in range(agi_values.shape[0]): ucgid_i = agi_values[["ucgid_str"]].iloc[i].values[0] stratum = get_simple_stratum_by_ucgid(session, ucgid_i) stratum.targets_rel.append( Target( - variable="agi", + variable="adjusted_gross_income", period=year, value=agi_values.iloc[i][["target_value"]].values[0], source_id=5, @@ -438,7 +416,7 @@ def load_soi_data(long_dfs, year): agi_person_count_dfs = [ df for df in long_dfs[43:] - if df["target_variable"].iloc[0] == "agi_person_count" + if df["target_variable"].iloc[0] == "person_count" ] for agi_df in agi_person_count_dfs: @@ -458,12 +436,12 @@ def load_soi_data(long_dfs, year): value="0100000US", ), StratumConstraint( - constraint_variable="agi", + constraint_variable="adjusted_gross_income", operation="greater_than", value=str(agi_income_lower), ), StratumConstraint( - constraint_variable="agi", + constraint_variable="adjusted_gross_income", operation="less_than", value=str(agi_income_upper), ), @@ -505,12 +483,12 @@ def load_soi_data(long_dfs, year): value=ucgid_i, ), StratumConstraint( - constraint_variable="agi", + constraint_variable="adjusted_gross_income", operation="greater_than", value=str(agi_income_lower), ), StratumConstraint( - constraint_variable="agi", + constraint_variable="adjusted_gross_income", operation="less_than", value=str(agi_income_upper), ), diff --git a/policyengine_us_data/db/validate_database.py b/policyengine_us_data/db/validate_database.py index 4ea1203d..94dcdddf 100644 --- a/policyengine_us_data/db/validate_database.py +++ b/policyengine_us_data/db/validate_database.py @@ -1,18 +1,23 @@ -from __future__ import annotations +""" +This is the start of a data validation pipeline. It is meant to be a separate +validation track from the unit tests in policyengine_us_data/tests in that it tests +the overall correctness of data after a full pipeline run with production data. 
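+
+The check mirrors the USVariable enum in create_database_tables.py: both
+accept only names present in the policyengine-us variable registry, e.g.
+
+    from policyengine_us.system import system
+    assert "eitc" in system.variables.keys()
+
+Run this script as the last step of `make database`:
+
+    python policyengine_us_data/db/validate_database.py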
+""" +import sqlite3 -import great_expectations as ge +import pandas as pd +from policyengine_us.system import system -def main() -> None: - """Run Great Expectations validation on the policy data database.""" - # Ensure we load the DataContext from the repository root - context = ge.get_context() - # Execute the checkpoint configured for the policy database - result = context.run_checkpoint(checkpoint_name="policy_data_checkpoint") - if not result["success"]: - raise ValueError("Great Expectations validation failed") - print("Great Expectations validation succeeded") +conn = sqlite3.connect("policyengine_us_data/storage/policy_data.db") +stratum_constraints_df = pd.read_sql("SELECT * FROM stratum_constraints", conn) +targets_df = pd.read_sql("SELECT * FROM targets", conn) -if __name__ == "__main__": # pragma: no cover - script entry point - main() +for var_name in set(targets_df['variable']): + if not var_name in system.variables.keys(): + raise ValueError(f'{var_name} not a policyengine-us variable') + +for var_name in set(stratum_constraints_df['constraint_variable']): + if not var_name in system.variables.keys(): + raise ValueError(f'{var_name} not a policyengine-us variable') diff --git a/policyengine_us_data/tests/test_database.py b/policyengine_us_data/tests/test_database.py index 13c102e5..ce5eb211 100644 --- a/policyengine_us_data/tests/test_database.py +++ b/policyengine_us_data/tests/test_database.py @@ -11,7 +11,6 @@ Target, create_database, ) -from policyengine_us_data.db import create_initial_strata @pytest.fixture @@ -25,7 +24,7 @@ def test_stratum_hash_and_relationships(engine): stratum = Stratum(notes="test", stratum_group_id=0) stratum.constraints_rel = [ StratumConstraint( - constraint_variable="ucgid_str", operation="in", value="0001" + constraint_variable="ucgid_str", operation="equals", value="0400000US30" ), StratumConstraint( constraint_variable="age", operation="greater_than", value="20" @@ -43,7 +42,7 @@ def test_stratum_hash_and_relationships(engine): "\n".join( sorted( [ - "ucgid_str|in|0001", + "ucgid_str|equals|0400000US30", "age|greater_than|20", "age|less_than|65", ] @@ -61,7 +60,7 @@ def test_unique_definition_hash(engine): s1 = Stratum(stratum_group_id=0) s1.constraints_rel = [ StratumConstraint( - constraint_variable="ucgid_str", operation="in", value="0001" + constraint_variable="ucgid_str", operation="equals", value="0400000US30" ) ] session.add(s1) @@ -69,42 +68,9 @@ def test_unique_definition_hash(engine): s2 = Stratum(stratum_group_id=0) s2.constraints_rel = [ StratumConstraint( - constraint_variable="ucgid_str", operation="in", value="0001" + constraint_variable="ucgid_str", operation="equals", value="0400000US30" ) ] session.add(s2) with pytest.raises(IntegrityError): session.commit() - - -def test_create_initial_strata(monkeypatch, engine, tmp_path): - # ``monkeypatch`` is a pytest fixture that lets us temporarily modify or replace - # objects during a test. Here we use it to point ``STORAGE_FOLDER`` to a - # temporary directory so the test doesn't touch real data on disk. - monkeypatch.setattr(create_initial_strata, "STORAGE_FOLDER", tmp_path) - - class FakeEnum(Enum): - NAT = "NAT" - STATE = "STATE" - DIST = "DIST" - - def get_hierarchical_codes(self): - mapping = { - FakeEnum.NAT: ["NAT"], - FakeEnum.STATE: ["STATE", "NAT"], - FakeEnum.DIST: ["DIST", "STATE", "NAT"], - } - return mapping[self] - - # Replace the real ``UCGID`` enumeration with our simplified version so the - # test can run without downloading geographic data. 
From d295926887f14bf802ab23e3742ed7a3aaa942e5 Mon Sep 17 00:00:00 2001
From: "baogorek@gmail.com"
Date: Mon, 18 Aug 2025 18:53:42 -0400
Subject: [PATCH 25/27] post lint

---
 .../db/create_database_tables.py             |  6 ++++--
 policyengine_us_data/db/etl_irs_soi.py       | 16 +++++++++++-----
 policyengine_us_data/db/validate_database.py | 11 ++++++-----
 policyengine_us_data/tests/test_database.py  | 12 +++++++++---
 4 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/policyengine_us_data/db/create_database_tables.py b/policyengine_us_data/db/create_database_tables.py
index 4c451467..df03772d 100644
--- a/policyengine_us_data/db/create_database_tables.py
+++ b/policyengine_us_data/db/create_database_tables.py
@@ -11,7 +11,7 @@
     SQLModel,
     create_engine,
 )
-from policyengine_us.system import system
+from policyengine_us.system import system
 
 from policyengine_us_data.storage import STORAGE_FOLDER
 
@@ -25,7 +25,9 @@
 
 
 # An Enum type to ensure the variable exists in policyengine-us
-USVariable = Enum("USVariable", {name: name for name in system.variables.keys()}, type=str)
+USVariable = Enum(
+    "USVariable", {name: name for name in system.variables.keys()}, type=str
+)
 
 
 class Stratum(SQLModel, table=True):
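The functional `USVariable` Enum above doubles as a validity gate: constructing a member from an unknown name raises immediately. A sketch with a stand-in registry (the real code enumerates all of `system.variables`):

    from enum import Enum

    # Stand-in for system.variables.keys().
    known = ["eitc", "income_tax", "tax_unit_count"]
    USVariable = Enum("USVariable", {name: name for name in known}, type=str)

    print(USVariable("eitc"))  # OK: USVariable.eitc

    try:
        USVariable("not_a_variable")
    except ValueError as err:
        print(err)  # 'not_a_variable' is not a valid USVariable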
constraint_variable="eitc_children", + constraint_variable="eitc_child_count", operation="equals", value=f"{n_children}", ) @@ -394,7 +400,7 @@ def load_soi_data(long_dfs, year): # Adjusted Gross Income ------ agi_values = long_dfs[36] - assert agi_values[['target_variable']].values[0] == 'adjusted_gross_income' + assert agi_values[["target_variable"]].values[0] == "adjusted_gross_income" for i in range(agi_values.shape[0]): ucgid_i = agi_values[["ucgid_str"]].iloc[i].values[0] diff --git a/policyengine_us_data/db/validate_database.py b/policyengine_us_data/db/validate_database.py index 94dcdddf..fee6a49d 100644 --- a/policyengine_us_data/db/validate_database.py +++ b/policyengine_us_data/db/validate_database.py @@ -3,10 +3,11 @@ validation track from the unit tests in policyengine_us_data/tests in that it tests the overall correctness of data after a full pipeline run with production data. """ + import sqlite3 import pandas as pd -from policyengine_us.system import system +from policyengine_us.system import system conn = sqlite3.connect("policyengine_us_data/storage/policy_data.db") @@ -14,10 +15,10 @@ stratum_constraints_df = pd.read_sql("SELECT * FROM stratum_constraints", conn) targets_df = pd.read_sql("SELECT * FROM targets", conn) -for var_name in set(targets_df['variable']): +for var_name in set(targets_df["variable"]): if not var_name in system.variables.keys(): - raise ValueError(f'{var_name} not a policyengine-us variable') + raise ValueError(f"{var_name} not a policyengine-us variable") -for var_name in set(stratum_constraints_df['constraint_variable']): +for var_name in set(stratum_constraints_df["constraint_variable"]): if not var_name in system.variables.keys(): - raise ValueError(f'{var_name} not a policyengine-us variable') + raise ValueError(f"{var_name} not a policyengine-us variable") diff --git a/policyengine_us_data/tests/test_database.py b/policyengine_us_data/tests/test_database.py index ce5eb211..64060b48 100644 --- a/policyengine_us_data/tests/test_database.py +++ b/policyengine_us_data/tests/test_database.py @@ -24,7 +24,9 @@ def test_stratum_hash_and_relationships(engine): stratum = Stratum(notes="test", stratum_group_id=0) stratum.constraints_rel = [ StratumConstraint( - constraint_variable="ucgid_str", operation="equals", value="0400000US30" + constraint_variable="ucgid_str", + operation="equals", + value="0400000US30", ), StratumConstraint( constraint_variable="age", operation="greater_than", value="20" @@ -60,7 +62,9 @@ def test_unique_definition_hash(engine): s1 = Stratum(stratum_group_id=0) s1.constraints_rel = [ StratumConstraint( - constraint_variable="ucgid_str", operation="equals", value="0400000US30" + constraint_variable="ucgid_str", + operation="equals", + value="0400000US30", ) ] session.add(s1) @@ -68,7 +72,9 @@ def test_unique_definition_hash(engine): s2 = Stratum(stratum_group_id=0) s2.constraints_rel = [ StratumConstraint( - constraint_variable="ucgid_str", operation="equals", value="0400000US30" + constraint_variable="ucgid_str", + operation="equals", + value="0400000US30", ) ] session.add(s2) From bd104d002d05c86d680cfcc64d4448b0de324f07 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Tue, 19 Aug 2025 23:19:42 -0400 Subject: [PATCH 26/27] updating IRS target variables --- policyengine_us_data/db/etl_irs_soi.py | 34 +++++++++++++++++--------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py index e191acb2..91f40121 100644 --- 
From bd104d002d05c86d680cfcc64d4448b0de324f07 Mon Sep 17 00:00:00 2001
From: "baogorek@gmail.com"
Date: Tue, 19 Aug 2025 23:19:42 -0400
Subject: [PATCH 26/27] updating IRS target variables

---
 policyengine_us_data/db/etl_irs_soi.py | 34 +++++++++++++++++---------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py
index e191acb2..91f40121 100644
--- a/policyengine_us_data/db/etl_irs_soi.py
+++ b/policyengine_us_data/db/etl_irs_soi.py
@@ -161,28 +161,31 @@ def transform_soi_data(raw_df):
         dict(code="59663", name="eitc", breakdown=("eitc_child_count", 2)),
         dict(code="59664", name="eitc", breakdown=("eitc_child_count", "3+")),
         dict(
-            code="59664",
+            code="04475",
             name="qualified_business_income_deduction",
             breakdown=None,
         ),
         dict(code="18500", name="real_estate_taxes", breakdown=None),
         dict(code="01000", name="net_capital_gain", breakdown=None),
-        dict(code="03150", name="retirement_distributions", breakdown=None),
+        dict(code="01400", name="taxable_ira_distributions", breakdown=None),
         dict(code="00300", name="taxable_interest_income", breakdown=None),
         dict(code="00400", name="tax_exempt_interest_income", breakdown=None),
-        dict(
-            code="00600", name="non_qualified_dividend_income", breakdown=None
-        ),
+        dict(code="00600", name="dividend_income", breakdown=None),
         dict(code="00650", name="qualified_dividend_income", breakdown=None),
         dict(
             code="26270",
-            name="partnership_s_corp_income",
+            name="tax_unit_partnership_s_corp_income",
             breakdown=None,
         ),
-        dict(code="02500", name="social_security", breakdown=None),
+        dict(code="02500", name="taxable_social_security", breakdown=None),
         dict(code="02300", name="unemployment_compensation", breakdown=None),
-        dict(code="00700", name="salt_refund_income", breakdown=None),
-        dict(code="18425", name="reported_salt", breakdown=None),
+        dict(code="17000", name="medical_expense_deduction", breakdown=None),
+        dict(code="01700", name="taxable_pension_income", breakdown=None),
+        dict(code="11070", name="refundable_ctc", breakdown=None),
+        # NOTE: A18460 is the capped SALT deduction and matches the `salt` variable.
+        # Our SALT base currently excludes personal property taxes (not modeled yet),
+        # so amounts may be slightly below IRS totals.
+        dict(code="18460", name="salt", breakdown=None),
         dict(code="06500", name="income_tax", breakdown=None),
     ]
@@ -370,7 +373,14 @@ def load_soi_data(long_dfs, year):
     session.commit()
 
     # There are no breakdown variables used in the following set
-    for j in range(8, 36, 2):
+    first_agi_index = [
+        i
+        for i in range(len(long_dfs))
+        if long_dfs[i][["target_variable"]].values[0]
+        == "adjusted_gross_income"
+        and long_dfs[i][["breakdown_variable"]].values[0] == "one"
+    ][0]
+    for j in range(8, first_agi_index, 2):
         count_j, amount_j = long_dfs[j], long_dfs[j + 1]
         amount_variable_name = amount_j.iloc[0][["target_variable"]].values[0]
         print(
@@ -399,7 +409,7 @@ def load_soi_data(long_dfs, year):
     session.commit()
 
     # Adjusted Gross Income ------
-    agi_values = long_dfs[36]
+    agi_values = long_dfs[first_agi_index]
     assert agi_values[["target_variable"]].values[0] == "adjusted_gross_income"
 
     for i in range(agi_values.shape[0]):
@@ -421,7 +431,7 @@ def load_soi_data(long_dfs, year):
     agi_person_count_dfs = [
         df
-        for df in long_dfs[43:]
+        for df in long_dfs[(first_agi_index + 1) :]
         if df["target_variable"].iloc[0] == "person_count"
     ]
 
     for agi_df in agi_person_count_dfs:
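Patch 26 also replaces the hardcoded `long_dfs[36]` with a positional scan for the first national AGI frame, so the loader no longer breaks when targets are added or removed upstream. The same scan on two stand-in frames:

    import pandas as pd

    # Stand-ins for the long-format frames produced by transform_soi_data.
    long_dfs = [
        pd.DataFrame({"target_variable": ["eitc"], "breakdown_variable": ["one"]}),
        pd.DataFrame(
            {
                "target_variable": ["adjusted_gross_income"],
                "breakdown_variable": ["one"],
            }
        ),
    ]

    first_agi_index = [
        i
        for i in range(len(long_dfs))
        if long_dfs[i][["target_variable"]].values[0] == "adjusted_gross_income"
        and long_dfs[i][["breakdown_variable"]].values[0] == "one"
    ][0]
    assert first_agi_index == 1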
+ dict(code="18460", name="salt", breakdown=None), dict(code="06500", name="income_tax", breakdown=None), ] @@ -370,7 +373,14 @@ def load_soi_data(long_dfs, year): session.commit() # There are no breakdown variables used in the following set - for j in range(8, 36, 2): + first_agi_index = [ + i + for i in range(len(long_dfs)) + if long_dfs[i][["target_variable"]].values[0] + == "adjusted_gross_income" + and long_dfs[i][["breakdown_variable"]].values[0] == "one" + ][0] + for j in range(8, first_agi_index, 2): count_j, amount_j = long_dfs[j], long_dfs[j + 1] amount_variable_name = amount_j.iloc[0][["target_variable"]].values[0] print( @@ -399,7 +409,7 @@ def load_soi_data(long_dfs, year): session.commit() # Adjusted Gross Income ------ - agi_values = long_dfs[36] + agi_values = long_dfs[first_agi_index] assert agi_values[["target_variable"]].values[0] == "adjusted_gross_income" for i in range(agi_values.shape[0]): @@ -421,7 +431,7 @@ def load_soi_data(long_dfs, year): agi_person_count_dfs = [ df - for df in long_dfs[43:] + for df in long_dfs[(first_agi_index + 1) :] if df["target_variable"].iloc[0] == "person_count" ] From 28753119e3e7c26663cf96a0d06902679903bd42 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Wed, 20 Aug 2025 13:53:22 -0400 Subject: [PATCH 27/27] changing the salt variable to uncapped --- policyengine_us_data/db/etl_irs_soi.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py index 91f40121..786abb1c 100644 --- a/policyengine_us_data/db/etl_irs_soi.py +++ b/policyengine_us_data/db/etl_irs_soi.py @@ -182,10 +182,7 @@ def transform_soi_data(raw_df): dict(code="17000", name="medical_expense_deduction", breakdown=None), dict(code="01700", name="taxable_pension_income", breakdown=None), dict(code="11070", name="refundable_ctc", breakdown=None), - # NOTE: A18460 is the capped SALT deduction and matches the `salt` variable. - # Our SALT base currently excludes personal property taxes (not modeled yet), - # so amounts may be slightly below IRS totals. - dict(code="18460", name="salt", breakdown=None), + dict(code="18425", name="salt", breakdown=None), dict(code="06500", name="income_tax", breakdown=None), ]