From cd6cf58763ef079a2f434bd3c3915de31f7a1d3b Mon Sep 17 00:00:00 2001
From: baogorek
Date: Tue, 29 Jul 2025 08:56:11 -0400
Subject: [PATCH 01/27] first round of eitc targets are added

---
 .../db/load_treasury_targets.py | 162 ++++++++++++++++++
 pyproject.toml                  |   1 +
 2 files changed, 163 insertions(+)
 create mode 100644 policyengine_us_data/db/load_treasury_targets.py

diff --git a/policyengine_us_data/db/load_treasury_targets.py b/policyengine_us_data/db/load_treasury_targets.py
new file mode 100644
index 00000000..4326f269
--- /dev/null
+++ b/policyengine_us_data/db/load_treasury_targets.py
@@ -0,0 +1,162 @@
+import logging
+import requests
+from pathlib import Path
+import io
+
+import pandas as pd
+import numpy as np
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker
+
+from policyengine_us_data.db.create_database_tables import (
+    Stratum,
+    StratumConstraint,
+    Target,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+def extract_eitc_data():
+    # IRS Table 2.5, Tax Year 2020
+    url = "https://www.irs.gov/pub/irs-soi/20in25ic.xls"
+    r = requests.get(url, timeout=30)
+    r.raise_for_status()
+
+    # Pandas uses xlrd to open .xls
+    xls = pd.ExcelFile(io.BytesIO(r.content), engine="xlrd")
+    sheets = {name: xls.parse(name, header=None) for name in xls.sheet_names}
+
+    raw = sheets[xls.sheet_names[0]]
+    return raw
+
+
+def transform_eitc_data(raw_data):
+    # This is not ideal from a data processing standpoint, but it's too much
+    # effort to fully parse this hierarchical XLS for a few data points.
+    # At least the full lineage is represented from the source.
+
+    zero_children_returns = raw_data.iloc[8, 25]
+    zero_children_amount = raw_data.iloc[8, 26] * 1000
+
+    one_child_returns = raw_data.iloc[8, 39]
+    one_child_amount = raw_data.iloc[8, 40] * 1000
+
+    two_children_returns = raw_data.iloc[8, 57]
+    two_children_amount = raw_data.iloc[8, 58] * 1000
+
+    three_plus_children_returns = raw_data.iloc[8, 73]
+    three_plus_children_amount = raw_data.iloc[8, 74] * 1000
+
+    assert zero_children_returns == 7636714
+    assert zero_children_amount == 2255068000
+
+    df_long = pd.DataFrame([
+        ["0100000US", "children_equal_to", 0, "tax_unit_count", zero_children_returns],
+        ["0100000US", "children_equal_to", 1, "tax_unit_count", one_child_returns],
+        ["0100000US", "children_equal_to", 2, "tax_unit_count", two_children_returns],
+        ["0100000US", "children_greater_or_equal_to", 3, "tax_unit_count", three_plus_children_returns],
+        ["0100000US", "children_equal_to", 0, "eitc", zero_children_amount],
+        ["0100000US", "children_equal_to", 1, "eitc", one_child_amount],
+        ["0100000US", "children_equal_to", 2, "eitc", two_children_amount],
+        ["0100000US", "children_greater_or_equal_to", 3, "eitc", three_plus_children_amount],
+    ])
+
+    df_long.columns = ["ucgid", "constraint", "constraint_value", "variable", "value"]
+
+    df_long["period"] = 2020
+    df_long["reform_id"] = 0
+    df_long["source_id"] = 2
+    df_long["active"] = True
+
+    return df_long
+
+
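+# A minimal usage sketch of the two steps above (illustrative only):
+#
+#   df_long = transform_eitc_data(extract_eitc_data())
+#   # four child buckets (0, 1, 2, 3+) for each of the two variables
+#   assert (df_long.groupby("variable").size() == 4).all()
+#   # amounts are in dollars, so the eitc total should dwarf the return counts
+#   assert (
+#       df_long.loc[df_long.variable == "eitc", "value"].sum()
+#       > df_long.loc[df_long.variable == "tax_unit_count", "value"].sum()
+#   )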
+def load_eitc_data(df_long):
+
+    DATABASE_URL = "sqlite:///policy_data.db"
+    engine = create_engine(DATABASE_URL)
+
+    Session = sessionmaker(bind=engine)
+    session = Session()
+
+    ucgid = df_long.iloc[0]['ucgid']
+    for num_children in [0, 1, 2, 3]:
+        note = f"eitc_child_count: {num_children}, Geo: {ucgid}"
+        new_stratum = Stratum(
+            parent_stratum_id=None, stratum_group_id=0, notes=note
+        )
+
+        new_stratum.constraints_rel = [
+            StratumConstraint(
+                constraint_variable="ucgid",
+                operation="equals",
+                value=ucgid,
+            ),
+        ]
+
+        if num_children <= 2:
+            new_stratum.constraints_rel.append(
+                StratumConstraint(
+                    constraint_variable="eitc_child_count",
+                    operation="equals",
+                    value=str(num_children),
+                ),
+            )
+        elif num_children > 2:
+            new_stratum.constraints_rel.append(
+                StratumConstraint(
+                    constraint_variable="eitc_child_count",
+                    operation="greater_or_equal_than",
+                    value=str(3),
+                ),
+            )
+
+        rows = df_long.loc[df_long['constraint_value'] == num_children]
+        count_target = rows.loc[rows.variable == 'tax_unit_count']['value'].values[0]
+        amount_target = rows.loc[rows.variable == 'eitc']['value'].values[0]
+
+        # Avoiding magic numbers in the load step
+        count_active = rows.loc[rows.variable == 'tax_unit_count']['active'].values[0]
+        amount_active = rows.loc[rows.variable == 'eitc']['active'].values[0]
+
+        period = rows.iloc[0]['period']
+        source_id = rows.iloc[0]['source_id']
+
+        new_stratum.targets_rel = [
+            Target(
+                variable="eitc",
+                period=period,
+                value=amount_target,
+                source_id=source_id,
+                active=amount_active,
+            ),
+            Target(
+                variable="tax_unit_count",
+                period=period,
+                value=count_target,
+                source_id=source_id,
+                active=count_active,
+            ),
+        ]
+
+        session.add(new_stratum)
+        session.flush()
+        print(new_stratum.stratum_id)
+
+    session.commit()
+
+
+if __name__ == "__main__":
+
+    # --- ETL: Extract, Transform, Load ----
+
+    # ---- Extract ----------
+    national_df = extract_eitc_data()
+
+    # --- Transform ----------
+    long_national_df = transform_eitc_data(national_df)
+
+    # --- Load --------
+    load_eitc_data(long_national_df)
diff --git a/pyproject.toml b/pyproject.toml
index aac1a318..21e53d17 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,6 +41,7 @@ dependencies = [
     "us>=2.0.0",
     "sqlalchemy>=2.0.41",
     "sqlmodel>=0.0.24",
+    "xlrd>=2.0.2",
 ]
 
 [project.optional-dependencies]

From 867bec647f48d2c675aa580fc10241c65154801d Mon Sep 17 00:00:00 2001
From: baogorek
Date: Tue, 29 Jul 2025 08:56:44 -0400
Subject: [PATCH 02/27] linting

---
 .../db/load_treasury_targets.py | 124 +++++++++++++-----
 1 file changed, 89 insertions(+), 35 deletions(-)

diff --git a/policyengine_us_data/db/load_treasury_targets.py b/policyengine_us_data/db/load_treasury_targets.py
index 4326f269..20d52cef 100644
--- a/policyengine_us_data/db/load_treasury_targets.py
+++ b/policyengine_us_data/db/load_treasury_targets.py
@@ -21,13 +21,13 @@ def extract_eitc_data():
     # IRS Table 2.5, Tax Year 2020
     url = "https://www.irs.gov/pub/irs-soi/20in25ic.xls"
-    r = requests.get(url, timeout=30) 
+    r = requests.get(url, timeout=30)
     r.raise_for_status()
-    
+
     # Pandas uses xlrd to open .xls
     xls = pd.ExcelFile(io.BytesIO(r.content), engine="xlrd")
     sheets = {name: xls.parse(name, header=None) for name in xls.sheet_names}
-    
+
     raw = sheets[xls.sheet_names[0]]
     return raw
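# NOTE: the -/+ pairs in the hunk above differ only in trailing whitespace;
# the hunks below reflow long literals. The style matches black's defaults,
# so a plausible invocation (assuming black is the project's formatter) is:
#
#   black policyengine_us_data/db/load_treasury_targets.py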
@@ -39,32 +39,82 @@ def transform_eitc_data(raw_data):
 
     zero_children_returns = raw_data.iloc[8, 25]
     zero_children_amount = raw_data.iloc[8, 26] * 1000
-    
+
     one_child_returns = raw_data.iloc[8, 39]
     one_child_amount = raw_data.iloc[8, 40] * 1000
-    
+
     two_children_returns = raw_data.iloc[8, 57]
     two_children_amount = raw_data.iloc[8, 58] * 1000
 
     three_plus_children_returns = raw_data.iloc[8, 73]
     three_plus_children_amount = raw_data.iloc[8, 74] * 1000
 
-    assert zero_children_returns == 7636714 
-    assert zero_children_amount == 2255068000 
+    assert zero_children_returns == 7636714
+    assert zero_children_amount == 2255068000
 
-    df_long = pd.DataFrame([
-        ["0100000US", "children_equal_to", 0, "tax_unit_count", zero_children_returns],
-        ["0100000US", "children_equal_to", 1, "tax_unit_count", one_child_returns],
-        ["0100000US", "children_equal_to", 2, "tax_unit_count", two_children_returns],
-        ["0100000US", "children_greater_or_equal_to", 3, "tax_unit_count", three_plus_children_returns],
-        ["0100000US", "children_equal_to", 0, "eitc", zero_children_amount],
-        ["0100000US", "children_equal_to", 1, "eitc", one_child_amount],
-        ["0100000US", "children_equal_to", 2, "eitc", two_children_amount],
-        ["0100000US", "children_greater_or_equal_to", 3, "eitc", three_plus_children_amount],
-    ])
-
-    df_long.columns = ["ucgid", "constraint", "constraint_value", "variable", "value"]
-
+    df_long = pd.DataFrame(
+        [
+            [
+                "0100000US",
+                "children_equal_to",
+                0,
+                "tax_unit_count",
+                zero_children_returns,
+            ],
+            [
+                "0100000US",
+                "children_equal_to",
+                1,
+                "tax_unit_count",
+                one_child_returns,
+            ],
+            [
+                "0100000US",
+                "children_equal_to",
+                2,
+                "tax_unit_count",
+                two_children_returns,
+            ],
+            [
+                "0100000US",
+                "children_greater_or_equal_to",
+                3,
+                "tax_unit_count",
+                three_plus_children_returns,
+            ],
+            [
+                "0100000US",
+                "children_equal_to",
+                0,
+                "eitc",
+                zero_children_amount,
+            ],
+            ["0100000US", "children_equal_to", 1, "eitc", one_child_amount],
+            [
+                "0100000US",
+                "children_equal_to",
+                2,
+                "eitc",
+                two_children_amount,
+            ],
+            [
+                "0100000US",
+                "children_greater_or_equal_to",
+                3,
+                "eitc",
+                three_plus_children_amount,
+            ],
+        ]
+    )
+
+    df_long.columns = [
+        "ucgid",
+        "constraint",
+        "constraint_value",
+        "variable",
+        "value",
+    ]
+
     df_long["period"] = 2020
     df_long["reform_id"] = 0
     df_long["source_id"] = 2
@@ -81,7 +131,7 @@ def load_eitc_data(df_long):
     Session = sessionmaker(bind=engine)
     session = Session()
 
-    ucgid = df_long.iloc[0]['ucgid']
+    ucgid = df_long.iloc[0]["ucgid"]
     for num_children in [0, 1, 2, 3]:
         note = f"eitc_child_count: {num_children}, Geo: {ucgid}"
         new_stratum = Stratum(
@@ -99,30 +149,34 @@ def load_eitc_data(df_long):
         if num_children <= 2:
             new_stratum.constraints_rel.append(
                 StratumConstraint(
-                    constraint_variable="eitc_child_count", 
-                    operation="equals", 
-                    value=str(num_children), 
+                    constraint_variable="eitc_child_count",
+                    operation="equals",
+                    value=str(num_children),
                 ),
             )
         elif num_children > 2:
             new_stratum.constraints_rel.append(
                 StratumConstraint(
-                    constraint_variable="eitc_child_count", 
-                    operation="greater_or_equal_than", 
-                    value=str(3), 
+                    constraint_variable="eitc_child_count",
+                    operation="greater_or_equal_than",
+                    value=str(3),
                 ),
             )
 
-        rows = df_long.loc[df_long['constraint_value'] == num_children]
-        count_target = rows.loc[rows.variable == 'tax_unit_count']['value'].values[0]
-        amount_target = rows.loc[rows.variable == 'eitc']['value'].values[0]
+        rows = df_long.loc[df_long["constraint_value"] == num_children]
+        count_target = rows.loc[rows.variable == "tax_unit_count"][
+            "value"
+        ].values[0]
+        amount_target = rows.loc[rows.variable == "eitc"]["value"].values[0]
 
         # Avoiding magic numbers in the load step
-        count_active = rows.loc[rows.variable == 'tax_unit_count']['active'].values[0]
-        amount_active = rows.loc[rows.variable == 'eitc']['active'].values[0]
+        count_active = rows.loc[rows.variable == "tax_unit_count"][
+            "active"
+        ].values[0]
+        amount_active = rows.loc[rows.variable == "eitc"]["active"].values[0]
 
-        period = rows.iloc[0]['period']
-        source_id = rows.iloc[0]['source_id']
+        period = rows.iloc[0]["period"]
+        source_id = rows.iloc[0]["source_id"]
 
         new_stratum.targets_rel = [
             Target(

From c2dd4af41b0792866acfe676aa8f15098a26635f Mon Sep 17 00:00:00 2001
From: baogorek
Date: Tue, 29 Jul 2025 09:09:18 -0400
Subject:
[PATCH 03/27] changelog_entry.yaml --- changelog_entry.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 6331425b..5bd54961 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,5 +1,4 @@ - bump: minor changes: added: - - Added creation script to build relational database for targets - - Refactored age targets load script to load the database + - load script for eitc targets From 95a4a9a48450098c1cfe1a204a7a1dbb0ef4450e Mon Sep 17 00:00:00 2001 From: baogorek Date: Sat, 2 Aug 2025 09:53:51 -0400 Subject: [PATCH 04/27] new file in progress --- policyengine_us_data/db/load_soi_targets.py | 607 ++++++++++++++++++++ 1 file changed, 607 insertions(+) create mode 100644 policyengine_us_data/db/load_soi_targets.py diff --git a/policyengine_us_data/db/load_soi_targets.py b/policyengine_us_data/db/load_soi_targets.py new file mode 100644 index 00000000..74e3dd1d --- /dev/null +++ b/policyengine_us_data/db/load_soi_targets.py @@ -0,0 +1,607 @@ +# This is the file where we actually get the SOI information that we want: + +# Goal: start with raw AGI and EITC: +# Data Dictionary: https://www.irs.gov/pub/irs-soi/22incddocguide.docx +# The Data: https://www.irs.gov/pub/irs-soi/22incd.csv + +from pathlib import Path + +from typing import Optional, Union + +import numpy as np +import pandas as pd +import logging + +from policyengine_us_data.storage import CALIBRATION_FOLDER + +logger = logging.getLogger(__name__) + +"""Utilities to pull AGI targets from the IRS SOI data files.""" + +# Congressional districts have one fewer level than the national and state +# They're missing the million plus category +# ("No AGI Stub") is a specific, intentional category used by the IRS in its summary data files. +SOI_COLUMNS = [ + "Under $1", + "$1 under $10,000", + "$10,000 under $25,000", + "$25,000 under $50,000", + "$50,000 under $75,000", + "$75,000 under $100,000", + "$100,000 under $200,000", + "$200,000 under $500,000", + "$500,000 or more", +] + +AGI_STUB_TO_BAND = {i + 1: band for i, band in enumerate(SOI_COLUMNS)} + +AGI_BOUNDS = { + "Under $1": (-np.inf, 1), + "$1 under $10,000": (1, 10_000), + "$10,000 under $25,000": (10_000, 25_000), + "$25,000 under $50,000": (25_000, 50_000), + "$50,000 under $75,000": (50_000, 75_000), + "$75,000 under $100,000": (75_000, 100_000), + "$100,000 under $200,000": (100_000, 200_000), + "$200,000 under $500,000": (200_000, 500_000), + "$500,000 or more": (500_000, np.inf), +} + +#NON_VOTING_STATES = {"US", "AS", "GU", "MP", "PR", "VI", "OA"} + +IGNORE_GEO_IDS = { + "0400000US72", # Puerto Rico (state level) + "5001800US7298", # Puerto Rico + "5001800US6098", # American Samoa + "5001800US6698", # Guam + "5001800US6998", # Northern Mariana Islands + "5001800US7898", # U.S. 
Virgin Islands +} + +# after skipping the first 7 rows, the national SOI file has targets as row indices [COUNT_INDEX, AMOUNT_INDEX] +NATIONAL_VARIABLES = { + "adjusted_gross_income": [0, 17], +} + +# the state and district SOI file have targets as column names [COUNT_COL_NAME, AMOUNT_COL_NAME] +GEOGRAPHY_VARIABLES = {"adjusted_gross_income": ["N1", "A00100"]} + +STATE_ABBR_TO_FIPS = { + "AL": "01", + "AK": "02", + "AZ": "04", + "AR": "05", + "CA": "06", + "CO": "08", + "CT": "09", + "DC": "11", + "DE": "10", + "FL": "12", + "GA": "13", + "HI": "15", + "ID": "16", + "IL": "17", + "IN": "18", + "IA": "19", + "KS": "20", + "KY": "21", + "LA": "22", + "ME": "23", + "MD": "24", + "MA": "25", + "MI": "26", + "MN": "27", + "MS": "28", + "MO": "29", + "MT": "30", + "NE": "31", + "NV": "32", + "NH": "33", + "NJ": "34", + "NM": "35", + "NY": "36", + "NC": "37", + "ND": "38", + "OH": "39", + "OK": "40", + "OR": "41", + "PA": "42", + "RI": "44", + "SC": "45", + "SD": "46", + "TN": "47", + "TX": "48", + "UT": "49", + "VT": "50", + "VA": "51", + "WA": "53", + "WV": "54", + "WI": "55", + "WY": "56", +} +FIPS_TO_STATE_ABBR = {v: k for k, v in STATE_ABBR_TO_FIPS.items()} + + +def pull_national_soi_variable( + soi_variable_ident: int, # the national SOI xlsx file has a row for each target variable + variable_name: Union[str, None], + is_count: bool, + national_df: Optional[pd.DataFrame] = None, +) -> pd.DataFrame: + """Download and save national AGI totals.""" + df = pd.read_excel( + "https://www.irs.gov/pub/irs-soi/22in54us.xlsx", skiprows=7 + ) + + assert ( + np.abs( + df.iloc[soi_variable_ident, 1] + - df.iloc[soi_variable_ident, 2:12].sum() + ) + < 100 + ), "Row 0 doesn't add up — check the file." + + agi_values = df.iloc[soi_variable_ident, 2:12].astype(int).to_numpy() + agi_values = np.concatenate( + [agi_values[:8], [agi_values[8] + agi_values[9]]] + ) + + agi_brackets = [ + AGI_STUB_TO_BAND[i] for i in range(1, len(SOI_COLUMNS) + 1) + ] + + result = pd.DataFrame( + { + "GEO_ID": ["0100000US"] * len(agi_brackets), + "GEO_NAME": ["national"] * len(agi_brackets), + "LOWER_BOUND": [AGI_BOUNDS[b][0] for b in agi_brackets], + "UPPER_BOUND": [AGI_BOUNDS[b][1] for b in agi_brackets], + "VALUE": agi_values, + } + ) + + # final column order + result = result[ + ["GEO_ID", "GEO_NAME", "LOWER_BOUND", "UPPER_BOUND", "VALUE"] + ] + result["IS_COUNT"] = int(is_count) + result["VARIABLE"] = variable_name + + result["VALUE"] = np.where( + result["IS_COUNT"] == 0, result["VALUE"] * 1_000, result["VALUE"] + ) + + if national_df is not None: + # If a DataFrame is passed, we append the new data to it. 
+ df = pd.concat([national_df, result], ignore_index=True) + return df + + return result + + +def pull_state_soi_variable( + soi_variable_ident: str, # the state SOI csv file has a column for each target variable + variable_name: Union[str, None], + is_count: bool, + state_df: Optional[pd.DataFrame] = None, +) -> pd.DataFrame: + """Download and save state AGI totals.""" + df = pd.read_csv( + "https://www.irs.gov/pub/irs-soi/22in55cmcsv.csv", thousands="," + ) + + merged = ( + df[df["AGI_STUB"].isin([9, 10])] + .groupby("STATE", as_index=False) + .agg({soi_variable_ident: "sum"}) + .assign(AGI_STUB=9) + ) + df = df[~df["AGI_STUB"].isin([9, 10])] + df = pd.concat([df, merged], ignore_index=True) + df = df[df["AGI_STUB"] != 0] + + df["agi_bracket"] = df["AGI_STUB"].map(AGI_STUB_TO_BAND) + + df["state_abbr"] = df["STATE"] + df["GEO_ID"] = "0400000US" + df["state_abbr"].map(STATE_ABBR_TO_FIPS) + df["GEO_NAME"] = "state_" + df["state_abbr"] + + result = df.loc[ + ~df["STATE"].isin(NON_VOTING_STATES.union({"US"})), + ["GEO_ID", "GEO_NAME", "agi_bracket", soi_variable_ident], + ].rename(columns={soi_variable_ident: "VALUE"}) + + result["LOWER_BOUND"] = result["agi_bracket"].map( + lambda b: AGI_BOUNDS[b][0] + ) + result["UPPER_BOUND"] = result["agi_bracket"].map( + lambda b: AGI_BOUNDS[b][1] + ) + + # final column order + result = result[ + ["GEO_ID", "GEO_NAME", "LOWER_BOUND", "UPPER_BOUND", "VALUE"] + ] + result["IS_COUNT"] = int(is_count) + result["VARIABLE"] = variable_name + + result["VALUE"] = np.where( + result["IS_COUNT"] == 0, result["VALUE"] * 1_000, result["VALUE"] + ) + + if state_df is not None: + # If a DataFrame is passed, we append the new data to it. + df = pd.concat([state_df, result], ignore_index=True) + return df + + return result + + +def extract_soi_data() -> pd.DataFrame: + """Download and save congressional district AGI totals. + + In the file below, "22" is 2022, "in" is individual returns, + "cd" is congressional districts + + """ + return pd.read_csv("https://www.irs.gov/pub/irs-soi/22incd.csv") + + +raw_df = df +# a "stub" is a term the IRS uses for a predefined category or group, specifically an income bracket. + +def transform_soi_data(raw_df) + + # agi_stub is only 0, so there are only agi breakdowns at the state level + # So you can confirm summability for 0 and then forget that national exists + # Honestly I think that's a better idea in general. If your states don't add + # Up to your national, something's off and you should treat it as an immediate + # problem to fix rather than something to be adjusted + national_df = raw_df.copy().loc[ + (raw_df.STATE == "US") + ] + + # You've got agi_stub == 0 in here, which you want to use any time you don't want to + # break things up by AGI + state_df = raw_df.copy().loc[ + (raw_df.STATE != "US") & + (raw_df.CONG_DISTRICT == 0) + ] + + # This is going to fail because we're missing the single cong district states + district_df = raw_df.copy().loc[ + (raw_df.CONG_DISTRICT > 0) + ] + + max_cong_district_by_state = raw_df.groupby('STATE')['CONG_DISTRICT'].transform('max') + district_df = raw_df.copy().loc[ + (raw_df['CONG_DISTRICT'] > 0) | (max_cong_district_by_state == 0) + ] + district_df = district_df.loc[district_df['STATE'] != 'US'] + + assert district_df.shape[0] % 436 == 0 + + # And you've got everything you need for all 3 levels of targets from this guy + + # So I want to get 2 variable categories out of this thing, in long format + # 1) EITC, and 2) AGI + # There's eitc_child_count, eitc. 
There's person_count and tax_unit_count + # but no household_count. That's why you're doing this though, for a great example + # Wide (a new variable per number of children) or Long (breakdown variable is number of children) + + + district_df["STATEFIPS"] = district_df["STATEFIPS"].astype(int).astype(str).str.zfill(2) + district_df["CONG_DISTRICT"] = ( + district_df["CONG_DISTRICT"].astype(int).astype(str).str.zfill(2) + ) + district_df["ucgid_str"] = "5001800US" + district_df["STATEFIPS"] + district_df["CONG_DISTRICT"] + + district_df = district_df[~district_df["ucgid_str"].isin(IGNORE_GEO_IDS)] + + # eitc: you'll only want to take agi_stub = 0 cases + + district_marginals = district_df.copy().loc[district_df.agi_stub == 0] + assert district_marginals.shape[0] == 436 + + eitc_no_children = district_marginals.copy()[['ucgid_str', 'N59661', 'A59661']].rename({ + 'N59661': 'tax_unit_count', + 'A59661': 'eitc' + }, axis = 1) + + eitc_no_children['eitc_children'] = 0 + + eitc_one_child = district_marginals.copy()[['ucgid_str', 'N59662', 'A59662']].rename({ + 'N59662': 'tax_unit_count', + 'A59662': 'eitc' + }, axis=1) + eitc_one_child['eitc_children'] = 1 + + eitc_two_children = district_marginals.copy()[['ucgid_str', 'N59663', 'A59663']].rename({ + 'N59663': 'tax_unit_count', + 'A59663': 'eitc' + }, axis=1) + eitc_two_children['eitc_children'] = 2 + + eitc_three_plus_children = district_marginals.copy()[['ucgid_str', 'N59664', 'A59664']].rename({ + 'N59664': 'tax_unit_count', + 'A59664': 'eitc' + }, axis=1) + eitc_three_plus_children['eitc_children'] = '3+' + + # Question: so many: why do this processing at the district level, since the structure is the same all over? + # OR, is it? At least the renaming is. + # Keep going for now and see how much you can generalize + + + + + at_large_states = ( + district_df.groupby("STATEFIPS")["CONG_DISTRICT"] + .nunique() + .pipe(lambda s: s[s == 1].index) + ) + district_df = district_df.loc[ + (district_df["CONG_DISTRICT"] != "00") | (district_df["STATEFIPS"].isin(at_large_states)) + ].reset_index(drop=True) + + district_df["GEO_NAME"] = "district_" + ( + f"{district_df['STATEFIPS'].map(FIPS_TO_STATE_ABBR)}-{district_df['CONG_DISTRICT']}" + ) + + district_df["agi_bracket"] = district_df["agi_stub"].map(AGI_STUB_TO_BAND) + + district_df + + result = df[ + [ + "GEO_ID", + "GEO_NAME", + "CONG_DISTRICT", + "STATE", + "agi_bracket", + soi_variable_ident, + ] + ].rename(columns={soi_variable_ident: "VALUE"}) + + result["LOWER_BOUND"] = result["agi_bracket"].map( + lambda b: AGI_BOUNDS[b][0] + ) + result["UPPER_BOUND"] = result["agi_bracket"].map( + lambda b: AGI_BOUNDS[b][1] + ) + + # if redistrict: + # result = apply_redistricting(result, variable_name) + + assert df["GEO_ID"].nunique() == 436 + + if redistrict: + # After redistricting, validate against the new district codes from the mapping + mapping_df = pd.read_csv(CALIBRATION_FOLDER / "district_mapping.csv") + valid_district_codes = set(mapping_df["code_new"].unique()) + + # Check that all GEO_IDs are valid + produced_codes = set(result["GEO_ID"]) + invalid_codes = produced_codes - valid_district_codes + assert ( + not invalid_codes + ), f"Invalid district codes after redistricting: {invalid_codes}" + + # Check we have exactly 436 districts + assert ( + len(produced_codes) == 436 + ), f"Expected 436 districts after redistricting, got {len(produced_codes)}" + + # Check that all GEO_IDs successfully mapped to names + missing_names = result[result["GEO_NAME"].isna()]["GEO_ID"].unique() + assert ( + 
len(missing_names) == 0 + ), f"GEO_IDs without names in ID_TO_NAME mapping: {missing_names}" + + # final column order + result = result[ + ["GEO_ID", "GEO_NAME", "LOWER_BOUND", "UPPER_BOUND", "VALUE"] + ] + result["IS_COUNT"] = int(is_count) + result["VARIABLE"] = variable_name + + result["VALUE"] = np.where( + result["IS_COUNT"] == 0, result["VALUE"] * 1_000, result["VALUE"] + ) + + if district_df is not None: + # If a DataFrame is passed, we append the new data to it. + df = pd.concat([district_df, result], ignore_index=True) + return df + + return result + + +def _get_soi_data(geo_level: str) -> pd.DataFrame: + """ + geo_level ∈ {'National', 'State', 'District'} + Returns a DataFrame with all SOI variables for the specified geography level + """ + if geo_level == "National": + var_indices = NATIONAL_VARIABLES + variable_pull = pull_national_soi_variable + elif geo_level == "State": + var_indices = GEOGRAPHY_VARIABLES + variable_pull = pull_state_soi_variable + elif geo_level == "District": + var_indices = GEOGRAPHY_VARIABLES + variable_pull = pull_district_soi_variable + else: + raise ValueError("geo_level must be National, State or District") + + df = pd.DataFrame() + for variable, identifiers in var_indices.items(): + count_id, amount_id = identifiers + # Pull count data (first identifier) + count_df = variable_pull( + soi_variable_ident=count_id, + variable_name=variable, + is_count=float(True), + ) + df = pd.concat([df, count_df], ignore_index=True) + # Pull amount data (second identifier) + amount_df = variable_pull( + soi_variable_ident=amount_id, + variable_name=variable, + is_count=float(False), + ) + df = pd.concat([df, amount_df], ignore_index=True) + + return df + + +def combine_geography_levels(districts: Optional[bool] = False) -> None: + """Combine SOI data across geography levels with validation and rescaling.""" + national = _get_soi_data("National") + state = _get_soi_data("State") + if districts: + district = _get_soi_data("District") + + # Add state FIPS codes for validation + state["STATEFIPS"] = state["GEO_ID"].str[-2:] + if districts: + district["STATEFIPS"] = district["GEO_ID"].str[-4:-2] + + # Get unique variables and AGI brackets for iteration + variables = national["VARIABLE"].unique() + agi_brackets = national[["LOWER_BOUND", "UPPER_BOUND"]].drop_duplicates() + + # Validate and rescale state totals against national totals + for variable in variables: + for is_count in [0.0, 1.0]: # Process count and amount separately + for _, bracket in agi_brackets.iterrows(): + lower, upper = ( + bracket["LOWER_BOUND"], + bracket["UPPER_BOUND"], + ) + + # Get national total for this variable/bracket/type combination + nat_mask = ( + (national["VARIABLE"] == variable) + & (national["LOWER_BOUND"] == lower) + & (national["UPPER_BOUND"] == upper) + & (national["IS_COUNT"] == is_count) + ) + us_total = national.loc[nat_mask, "VALUE"].iloc[0] + + # Get state total for this variable/bracket/type combination + state_mask = ( + (state["VARIABLE"] == variable) + & (state["LOWER_BOUND"] == lower) + & (state["UPPER_BOUND"] == upper) + & (state["IS_COUNT"] == is_count) + ) + state_total = state.loc[state_mask, "VALUE"].sum() + + # Rescale states if they don't match national total + if not np.isclose(state_total, us_total, rtol=1e-3): + count_type = "count" if is_count == 1.0 else "amount" + logger.warning( + f"States' sum does not match national total for {variable}/{count_type} " + f"in bracket [{lower}, {upper}]. Rescaling state targets." 
+ ) + state.loc[state_mask, "VALUE"] *= us_total / state_total + + if districts: + # Validate and rescale district totals against state totals + for variable in variables: + for is_count in [0.0, 1.0]: # Process count and amount separately + for _, bracket in agi_brackets.iterrows(): + lower, upper = ( + bracket["LOWER_BOUND"], + bracket["UPPER_BOUND"], + ) + + # Create masks for this variable/bracket/type combination + state_mask = ( + (state["VARIABLE"] == variable) + & (state["LOWER_BOUND"] == lower) + & (state["UPPER_BOUND"] == upper) + & (state["IS_COUNT"] == is_count) + ) + district_mask = ( + (district["VARIABLE"] == variable) + & (district["LOWER_BOUND"] == lower) + & (district["UPPER_BOUND"] == upper) + & (district["IS_COUNT"] == is_count) + ) + + # Get state totals indexed by STATEFIPS + state_totals = state.loc[state_mask].set_index("STATEFIPS")[ + "VALUE" + ] + + # Get district totals grouped by STATEFIPS + district_totals = ( + district.loc[district_mask] + .groupby("STATEFIPS")["VALUE"] + .sum() + ) + + # Check and rescale districts for each state + for fips, d_total in district_totals.items(): + s_total = state_totals.get(fips) + + if s_total is not None and not np.isclose( + d_total, s_total, rtol=1e-3 + ): + count_type = "count" if is_count == 1.0 else "amount" + logger.warning( + f"Districts' sum does not match {fips} state total for {variable}/{count_type} " + f"in bracket [{lower}, {upper}]. Rescaling district targets." + ) + rescale_mask = district_mask & ( + district["STATEFIPS"] == fips + ) + district.loc[rescale_mask, "VALUE"] *= ( + s_total / d_total + ) + + # Combine all data + combined = pd.concat( + [ + national, + state.drop(columns="STATEFIPS"), + ( + district.drop(columns="STATEFIPS") + if districts + else pd.DataFrame(columns=national.columns) + ), + ], + ignore_index=True, + ).sort_values(["GEO_ID", "VARIABLE", "LOWER_BOUND"]) + + combined["DATA_SOURCE"] = "soi" + combined["BREAKDOWN_VARIABLE"] = "adjusted_gross_income" + + combined = combined[ + [ + "DATA_SOURCE", + "GEO_ID", + "GEO_NAME", + "VARIABLE", + "VALUE", + "IS_COUNT", + "BREAKDOWN_VARIABLE", + "LOWER_BOUND", + "UPPER_BOUND", + ] + ] + + # Save combined data + out_path = CALIBRATION_FOLDER / "soi.csv" + combined.to_csv(out_path, index=False) + logger.info(f"Combined SOI targets saved to {out_path}") + + +def main() -> None: + combine_geography_levels() + + +if __name__ == "__main__": + main() From 6fd3542998cbec24cb0cf2d096840a7e51715eee Mon Sep 17 00:00:00 2001 From: baogorek Date: Wed, 6 Aug 2025 23:39:41 -0400 Subject: [PATCH 05/27] moving to QBID and SALT --- policyengine_us_data/db/load_age_targets.py | 22 +- policyengine_us_data/db/load_soi_targets.py | 416 ++++++++++---------- 2 files changed, 219 insertions(+), 219 deletions(-) diff --git a/policyengine_us_data/db/load_age_targets.py b/policyengine_us_data/db/load_age_targets.py index b93c7687..b588c922 100644 --- a/policyengine_us_data/db/load_age_targets.py +++ b/policyengine_us_data/db/load_age_targets.py @@ -174,18 +174,18 @@ def transform_age_data(age_data, docs): ) df = df.drop(columns="NAME") - df = df.rename({"GEO_ID": "ucgid"}, axis=1) - df_data = df.rename(columns=rename_mapping)[["ucgid"] + list(AGE_COLS)] + df = df.rename({"GEO_ID": "ucgid_str"}, axis=1) + df_data = df.rename(columns=rename_mapping)[["ucgid_str"] + list(AGE_COLS)] # Filter out Puerto Rico's district and state records, if needed df_geos = df_data[ - ~df_data["ucgid"].isin(["5001800US7298", "0400000US72"]) + ~df_data["ucgid_str"].isin(["5001800US7298", 
"0400000US72"]) ].copy() - df = df_geos[["ucgid"] + AGE_COLS] + df = df_geos[["ucgid_str"] + AGE_COLS] df_long = df.melt( - id_vars="ucgid", + id_vars="ucgid_str", value_vars=AGE_COLS, var_name="age_range", value_name="value", @@ -212,11 +212,11 @@ def load_age_data(df_long, geo, stratum_lookup={}): # Quick data quality check before loading ---- if geo == "National": - assert len(set(df_long.ucgid)) == 1 + assert len(set(df_long.ucgid_str)) == 1 elif geo == "State": - assert len(set(df_long.ucgid)) == 51 + assert len(set(df_long.ucgid_str)) == 51 elif geo == "District": - assert len(set(df_long.ucgid)) == 436 + assert len(set(df_long.ucgid_str)) == 436 else: raise ValueError('geo must be one of "National", "State", "District"') @@ -238,7 +238,7 @@ def load_age_data(df_long, geo, stratum_lookup={}): # Create the parent Stratum object. # We will attach children to it before adding it to the session. - note = f"Age: {row['age_range']}, Geo: {row['ucgid']}" + note = f"Age: {row['age_range']}, Geo: {row['ucgid_str']}" parent_geo = get_parent_geo(geo) parent_stratum_id = ( stratum_lookup[parent_geo][row["age_range"]] @@ -253,9 +253,9 @@ def load_age_data(df_long, geo, stratum_lookup={}): # Create constraints and link them to the parent's relationship attribute. new_stratum.constraints_rel = [ StratumConstraint( - constraint_variable="ucgid", + constraint_variable="ucgid_str", operation="equals", - value=row["ucgid"], + value=row["ucgid_str"], ), StratumConstraint( constraint_variable="age", diff --git a/policyengine_us_data/db/load_soi_targets.py b/policyengine_us_data/db/load_soi_targets.py index 74e3dd1d..b564bbf8 100644 --- a/policyengine_us_data/db/load_soi_targets.py +++ b/policyengine_us_data/db/load_soi_targets.py @@ -122,119 +122,128 @@ FIPS_TO_STATE_ABBR = {v: k for k, v in STATE_ABBR_TO_FIPS.items()} -def pull_national_soi_variable( - soi_variable_ident: int, # the national SOI xlsx file has a row for each target variable - variable_name: Union[str, None], - is_count: bool, - national_df: Optional[pd.DataFrame] = None, -) -> pd.DataFrame: - """Download and save national AGI totals.""" - df = pd.read_excel( - "https://www.irs.gov/pub/irs-soi/22in54us.xlsx", skiprows=7 - ) - - assert ( - np.abs( - df.iloc[soi_variable_ident, 1] - - df.iloc[soi_variable_ident, 2:12].sum() - ) - < 100 - ), "Row 0 doesn't add up — check the file." - - agi_values = df.iloc[soi_variable_ident, 2:12].astype(int).to_numpy() - agi_values = np.concatenate( - [agi_values[:8], [agi_values[8] + agi_values[9]]] - ) - - agi_brackets = [ - AGI_STUB_TO_BAND[i] for i in range(1, len(SOI_COLUMNS) + 1) - ] - - result = pd.DataFrame( - { - "GEO_ID": ["0100000US"] * len(agi_brackets), - "GEO_NAME": ["national"] * len(agi_brackets), - "LOWER_BOUND": [AGI_BOUNDS[b][0] for b in agi_brackets], - "UPPER_BOUND": [AGI_BOUNDS[b][1] for b in agi_brackets], - "VALUE": agi_values, - } - ) - - # final column order - result = result[ - ["GEO_ID", "GEO_NAME", "LOWER_BOUND", "UPPER_BOUND", "VALUE"] - ] - result["IS_COUNT"] = int(is_count) - result["VARIABLE"] = variable_name - - result["VALUE"] = np.where( - result["IS_COUNT"] == 0, result["VALUE"] * 1_000, result["VALUE"] - ) - - if national_df is not None: - # If a DataFrame is passed, we append the new data to it. 
- df = pd.concat([national_df, result], ignore_index=True) - return df - - return result - - -def pull_state_soi_variable( - soi_variable_ident: str, # the state SOI csv file has a column for each target variable - variable_name: Union[str, None], - is_count: bool, - state_df: Optional[pd.DataFrame] = None, -) -> pd.DataFrame: - """Download and save state AGI totals.""" - df = pd.read_csv( - "https://www.irs.gov/pub/irs-soi/22in55cmcsv.csv", thousands="," - ) - - merged = ( - df[df["AGI_STUB"].isin([9, 10])] - .groupby("STATE", as_index=False) - .agg({soi_variable_ident: "sum"}) - .assign(AGI_STUB=9) - ) - df = df[~df["AGI_STUB"].isin([9, 10])] - df = pd.concat([df, merged], ignore_index=True) - df = df[df["AGI_STUB"] != 0] - - df["agi_bracket"] = df["AGI_STUB"].map(AGI_STUB_TO_BAND) - - df["state_abbr"] = df["STATE"] - df["GEO_ID"] = "0400000US" + df["state_abbr"].map(STATE_ABBR_TO_FIPS) - df["GEO_NAME"] = "state_" + df["state_abbr"] - - result = df.loc[ - ~df["STATE"].isin(NON_VOTING_STATES.union({"US"})), - ["GEO_ID", "GEO_NAME", "agi_bracket", soi_variable_ident], - ].rename(columns={soi_variable_ident: "VALUE"}) - - result["LOWER_BOUND"] = result["agi_bracket"].map( - lambda b: AGI_BOUNDS[b][0] - ) - result["UPPER_BOUND"] = result["agi_bracket"].map( - lambda b: AGI_BOUNDS[b][1] - ) - - # final column order - result = result[ - ["GEO_ID", "GEO_NAME", "LOWER_BOUND", "UPPER_BOUND", "VALUE"] - ] - result["IS_COUNT"] = int(is_count) - result["VARIABLE"] = variable_name - - result["VALUE"] = np.where( - result["IS_COUNT"] == 0, result["VALUE"] * 1_000, result["VALUE"] - ) - - if state_df is not None: - # If a DataFrame is passed, we append the new data to it. - df = pd.concat([state_df, result], ignore_index=True) - return df - - return result +#def pull_national_soi_variable( +# soi_variable_ident: int, # the national SOI xlsx file has a row for each target variable +# variable_name: Union[str, None], +# is_count: bool, +# national_df: Optional[pd.DataFrame] = None, +#) -> pd.DataFrame: +# """Download and save national AGI totals.""" +# df = pd.read_excel( +# "https://www.irs.gov/pub/irs-soi/22in54us.xlsx", skiprows=7 +# ) +# +# assert ( +# np.abs( +# df.iloc[soi_variable_ident, 1] +# - df.iloc[soi_variable_ident, 2:12].sum() +# ) +# < 100 +# ), "Row 0 doesn't add up — check the file." +# +# agi_values = df.iloc[soi_variable_ident, 2:12].astype(int).to_numpy() +# agi_values = np.concatenate( +# [agi_values[:8], [agi_values[8] + agi_values[9]]] +# ) +# +# agi_brackets = [ +# AGI_STUB_TO_BAND[i] for i in range(1, len(SOI_COLUMNS) + 1) +# ] +# +# result = pd.DataFrame( +# { +# "GEO_ID": ["0100000US"] * len(agi_brackets), +# "GEO_NAME": ["national"] * len(agi_brackets), +# "LOWER_BOUND": [AGI_BOUNDS[b][0] for b in agi_brackets], +# "UPPER_BOUND": [AGI_BOUNDS[b][1] for b in agi_brackets], +# "VALUE": agi_values, +# } +# ) +# +# # final column order +# result = result[ +# ["GEO_ID", "GEO_NAME", "LOWER_BOUND", "UPPER_BOUND", "VALUE"] +# ] +# result["IS_COUNT"] = int(is_count) +# result["VARIABLE"] = variable_name +# +# result["VALUE"] = np.where( +# result["IS_COUNT"] == 0, result["VALUE"] * 1_000, result["VALUE"] +# ) +# +# if national_df is not None: +# # If a DataFrame is passed, we append the new data to it. 
+# df = pd.concat([national_df, result], ignore_index=True) +# return df +# +# return result +# +# +#def pull_state_soi_variable( +# soi_variable_ident: str, # the state SOI csv file has a column for each target variable +# variable_name: Union[str, None], +# is_count: bool, +# state_df: Optional[pd.DataFrame] = None, +#) -> pd.DataFrame: +# """Download and save state AGI totals.""" +# df = pd.read_csv( +# "https://www.irs.gov/pub/irs-soi/22in55cmcsv.csv", thousands="," +# ) +# +# merged = ( +# df[df["AGI_STUB"].isin([9, 10])] +# .groupby("STATE", as_index=False) +# .agg({soi_variable_ident: "sum"}) +# .assign(AGI_STUB=9) +# ) +# df = df[~df["AGI_STUB"].isin([9, 10])] +# df = pd.concat([df, merged], ignore_index=True) +# df = df[df["AGI_STUB"] != 0] +# +# df["agi_bracket"] = df["AGI_STUB"].map(AGI_STUB_TO_BAND) +# +# df["state_abbr"] = df["STATE"] +# df["GEO_ID"] = "0400000US" + df["state_abbr"].map(STATE_ABBR_TO_FIPS) +# df["GEO_NAME"] = "state_" + df["state_abbr"] +# +# result = df.loc[ +# ~df["STATE"].isin(NON_VOTING_STATES.union({"US"})), +# ["GEO_ID", "GEO_NAME", "agi_bracket", soi_variable_ident], +# ].rename(columns={soi_variable_ident: "VALUE"}) +# +# result["LOWER_BOUND"] = result["agi_bracket"].map( +# lambda b: AGI_BOUNDS[b][0] +# ) +# result["UPPER_BOUND"] = result["agi_bracket"].map( +# lambda b: AGI_BOUNDS[b][1] +# ) +# +# # final column order +# result = result[ +# ["GEO_ID", "GEO_NAME", "LOWER_BOUND", "UPPER_BOUND", "VALUE"] +# ] +# result["IS_COUNT"] = int(is_count) +# result["VARIABLE"] = variable_name +# +# result["VALUE"] = np.where( +# result["IS_COUNT"] == 0, result["VALUE"] * 1_000, result["VALUE"] +# ) +# +# if state_df is not None: +# # If a DataFrame is passed, we append the new data to it. +# df = pd.concat([state_df, result], ignore_index=True) +# return df +# +# return result + +def create_records(df, breakdown_variable, target_variable): + """Transforms a DataFrame subset into a standardized list of records.""" + temp_df = df[["ucgid_str"]].copy() + temp_df["breakdown_variable"] = breakdown_variable + temp_df["breakdown_value"] = df[breakdown_variable] + temp_df["target_variable"] = target_variable + temp_df["target_value"] = df[target_variable] + return temp_df def extract_soi_data() -> pd.DataFrame: @@ -242,16 +251,16 @@ def extract_soi_data() -> pd.DataFrame: In the file below, "22" is 2022, "in" is individual returns, "cd" is congressional districts - """ return pd.read_csv("https://www.irs.gov/pub/irs-soi/22incd.csv") -raw_df = df +raw_df = extract_soi_data() # a "stub" is a term the IRS uses for a predefined category or group, specifically an income bracket. def transform_soi_data(raw_df) + # agi_stub is only 0, so there are only agi breakdowns at the state level # So you can confirm summability for 0 and then forget that national exists # Honestly I think that's a better idea in general. 
If your states don't add @@ -260,6 +269,7 @@ def transform_soi_data(raw_df) national_df = raw_df.copy().loc[ (raw_df.STATE == "US") ] + national_df["ucgid_str"] = "0100000US" # You've got agi_stub == 0 in here, which you want to use any time you don't want to # break things up by AGI @@ -267,6 +277,7 @@ def transform_soi_data(raw_df) (raw_df.STATE != "US") & (raw_df.CONG_DISTRICT == 0) ] + state_df["ucgid_str"] = "0400000US" + state_df["STATEFIPS"].astype(str).str.zfill(2) # This is going to fail because we're missing the single cong district states district_df = raw_df.copy().loc[ @@ -278,10 +289,21 @@ def transform_soi_data(raw_df) (raw_df['CONG_DISTRICT'] > 0) | (max_cong_district_by_state == 0) ] district_df = district_df.loc[district_df['STATE'] != 'US'] + district_df["STATEFIPS"] = district_df["STATEFIPS"].astype(int).astype(str).str.zfill(2) + district_df["CONG_DISTRICT"] = ( + district_df["CONG_DISTRICT"].astype(int).astype(str).str.zfill(2) + ) + district_df["ucgid_str"] = "5001800US" + district_df["STATEFIPS"] + district_df["CONG_DISTRICT"] + district_df = district_df[~district_df["ucgid_str"].isin(IGNORE_GEO_IDS)] assert district_df.shape[0] % 436 == 0 - # And you've got everything you need for all 3 levels of targets from this guy + # And you've got everything you need for all 3 levels of targets: + # 1. national_df + # 2. state_df + # 3. district_df + + all_df = pd.concat([national_df, state_df, district_df]) # So I want to get 2 variable categories out of this thing, in long format # 1) EITC, and 2) AGI @@ -289,132 +311,110 @@ def transform_soi_data(raw_df) # but no household_count. That's why you're doing this though, for a great example # Wide (a new variable per number of children) or Long (breakdown variable is number of children) + # Marginal in terms of AGI, which this data set is organized with respect to + all_marginals = all_df.copy().loc[all_df.agi_stub == 0] + assert all_marginals.shape[0] == 436 + 51 + 1 - district_df["STATEFIPS"] = district_df["STATEFIPS"].astype(int).astype(str).str.zfill(2) - district_df["CONG_DISTRICT"] = ( - district_df["CONG_DISTRICT"].astype(int).astype(str).str.zfill(2) - ) - district_df["ucgid_str"] = "5001800US" + district_df["STATEFIPS"] + district_df["CONG_DISTRICT"] - - district_df = district_df[~district_df["ucgid_str"].isin(IGNORE_GEO_IDS)] + # Collect targets from the SOI file + records = [] - # eitc: you'll only want to take agi_stub = 0 cases - - district_marginals = district_df.copy().loc[district_df.agi_stub == 0] - assert district_marginals.shape[0] == 436 - - eitc_no_children = district_marginals.copy()[['ucgid_str', 'N59661', 'A59661']].rename({ + # EITC --------------------------------------------------------------------------- + eitc_no_children = all_marginals.copy().rename({ 'N59661': 'tax_unit_count', 'A59661': 'eitc' }, axis = 1) - eitc_no_children['eitc_children'] = 0 - eitc_one_child = district_marginals.copy()[['ucgid_str', 'N59662', 'A59662']].rename({ + records.append( + create_records(eitc_no_children, "eitc_children", "tax_unit_count") + ) + records.append( + create_records(eitc_no_children, "eitc_children", "eitc") + ) + + eitc_one_child = all_marginals.copy().rename({ 'N59662': 'tax_unit_count', 'A59662': 'eitc' }, axis=1) eitc_one_child['eitc_children'] = 1 - - eitc_two_children = district_marginals.copy()[['ucgid_str', 'N59663', 'A59663']].rename({ + + records.append( + create_records(eitc_one_child, "eitc_children", "tax_unit_count") + ) + records.append( + create_records(eitc_one_child, "eitc_children", 
"eitc") + ) + + eitc_two_children = all_marginals.copy().rename({ 'N59663': 'tax_unit_count', 'A59663': 'eitc' }, axis=1) eitc_two_children['eitc_children'] = 2 - - eitc_three_plus_children = district_marginals.copy()[['ucgid_str', 'N59664', 'A59664']].rename({ + + records.append( + create_records(eitc_two_children, "eitc_children", "tax_unit_count") + ) + records.append( + create_records(eitc_two_children, "eitc_children", "eitc") + ) + + eitc_three_plus_children = all_marginals.copy().rename({ 'N59664': 'tax_unit_count', 'A59664': 'eitc' }, axis=1) eitc_three_plus_children['eitc_children'] = '3+' - # Question: so many: why do this processing at the district level, since the structure is the same all over? - # OR, is it? At least the renaming is. - # Keep going for now and see how much you can generalize + records.append( + create_records(eitc_three_plus_children, "eitc_children", "tax_unit_count") + ) + records.append( + create_records(eitc_three_plus_children, "eitc_children", "eitc") + ) + # QBID ---------------------------------------------------------------------- + qbid = all_marginals.copy().rename({ + 'N59664': 'tax_unit_count', + 'A59664': 'qbid' + }, axis=1) + # No breakdown variable other than the geo here + qbid['one'] = 1 + records.append( + create_records(qbid, "one", "tax_unit_count") + ) + records.append( + create_records(qbid, "one", "qbid") + ) + # SALT ----------------------------------------------------------------------- - at_large_states = ( - district_df.groupby("STATEFIPS")["CONG_DISTRICT"] - .nunique() - .pipe(lambda s: s[s == 1].index) - ) - district_df = district_df.loc[ - (district_df["CONG_DISTRICT"] != "00") | (district_df["STATEFIPS"].isin(at_large_states)) - ].reset_index(drop=True) + # TODO: THERE's definitely a pattern here + # TODO: you forgot to multiply by 1000! + # For all the files, the money amounts are reported in thousands of dollars. 
+ salt = all_marginals.copy().rename({ + 'N18425': 'tax_unit_count', + 'A18425': 'salt' + }, axis=1) + salt['one'] = 1 - district_df["GEO_NAME"] = "district_" + ( - f"{district_df['STATEFIPS'].map(FIPS_TO_STATE_ABBR)}-{district_df['CONG_DISTRICT']}" + records.append( + create_records(salt, "one", "tax_unit_count") + ) + records.append( + create_records(qbid, "one", "salt") ) - district_df["agi_bracket"] = district_df["agi_stub"].map(AGI_STUB_TO_BAND) - district_df + return records + + + - result = df[ - [ - "GEO_ID", - "GEO_NAME", - "CONG_DISTRICT", - "STATE", - "agi_bracket", - soi_variable_ident, - ] - ].rename(columns={soi_variable_ident: "VALUE"}) - result["LOWER_BOUND"] = result["agi_bracket"].map( - lambda b: AGI_BOUNDS[b][0] - ) - result["UPPER_BOUND"] = result["agi_bracket"].map( - lambda b: AGI_BOUNDS[b][1] - ) - # if redistrict: - # result = apply_redistricting(result, variable_name) - - assert df["GEO_ID"].nunique() == 436 - - if redistrict: - # After redistricting, validate against the new district codes from the mapping - mapping_df = pd.read_csv(CALIBRATION_FOLDER / "district_mapping.csv") - valid_district_codes = set(mapping_df["code_new"].unique()) - - # Check that all GEO_IDs are valid - produced_codes = set(result["GEO_ID"]) - invalid_codes = produced_codes - valid_district_codes - assert ( - not invalid_codes - ), f"Invalid district codes after redistricting: {invalid_codes}" - - # Check we have exactly 436 districts - assert ( - len(produced_codes) == 436 - ), f"Expected 436 districts after redistricting, got {len(produced_codes)}" - - # Check that all GEO_IDs successfully mapped to names - missing_names = result[result["GEO_NAME"].isna()]["GEO_ID"].unique() - assert ( - len(missing_names) == 0 - ), f"GEO_IDs without names in ID_TO_NAME mapping: {missing_names}" - - # final column order - result = result[ - ["GEO_ID", "GEO_NAME", "LOWER_BOUND", "UPPER_BOUND", "VALUE"] - ] - result["IS_COUNT"] = int(is_count) - result["VARIABLE"] = variable_name - result["VALUE"] = np.where( - result["IS_COUNT"] == 0, result["VALUE"] * 1_000, result["VALUE"] - ) - if district_df is not None: - # If a DataFrame is passed, we append the new data to it. 
- df = pd.concat([district_df, result], ignore_index=True) - return df - return result def _get_soi_data(geo_level: str) -> pd.DataFrame: From c73ef870ca30da35dd7f3fa511ec13db517af359 Mon Sep 17 00:00:00 2001 From: baogorek Date: Thu, 7 Aug 2025 13:50:48 -0400 Subject: [PATCH 06/27] new variables added --- policyengine_us_data/db/load_soi_targets.py | 249 ++++++++++++-------- 1 file changed, 157 insertions(+), 92 deletions(-) diff --git a/policyengine_us_data/db/load_soi_targets.py b/policyengine_us_data/db/load_soi_targets.py index b564bbf8..2fe3fa91 100644 --- a/policyengine_us_data/db/load_soi_targets.py +++ b/policyengine_us_data/db/load_soi_targets.py @@ -5,8 +5,7 @@ # The Data: https://www.irs.gov/pub/irs-soi/22incd.csv from pathlib import Path - -from typing import Optional, Union +from typing import List, Optional, Sequence, Dict, Tuple, Any, Union import numpy as np import pandas as pd @@ -246,6 +245,97 @@ def create_records(df, breakdown_variable, target_variable): return temp_df +def make_records( + df: pd.DataFrame, + *, + count_col: str, + amount_col: str, + amount_name: str, + breakdown_col: Optional[str] = None, + multiplier: int = 1_000, +): + df = ( + df.rename({count_col: "tax_unit_count", + amount_col: amount_name}, + axis=1) + .copy() + ) + + if breakdown_col is None: + breakdown_col = "one" + df[breakdown_col] = 1 + + rec_counts = create_records(df, breakdown_col, "tax_unit_count") + rec_amounts = create_records(df, breakdown_col, amount_name) + rec_amounts["target_value"] *= multiplier # Only the amounts get * 1000 + rec_counts["target_variable"] = f"{amount_name}_tax_unit_count" + + return rec_counts, rec_amounts + + + +_TARGET_COL_MAP = { + "N1": "agi_tax_unit_count", # number of returns (≈ “tax units”) + "N2": "agi_person_count", # number of individuals + "A00100": "agi_total_amount", # total Adjusted Gross Income +} + +_BREAKDOWN_FIELD = "agi_stub" # numeric AGI stub 1‑10 from IRS +_BREAKDOWN_NAME = "agi_stub" # what will go in `breakdown_variable` + +def make_agi_long(df: pd.DataFrame) -> pd.DataFrame: + """ + Convert IRS SOI AGI‑split table from wide to the long format used + in your `records[*]` list. + + Parameters + ---------- + df : DataFrame + Must contain `ucgid_str`, `agi_stub` and the three IRS fields + in `_TARGET_COL_MAP` (N1, N2, A00100). + + Returns + ------- + DataFrame with columns: + ucgid_str + breakdown_variable (always "agi_stub") + breakdown_value (1‑10) + target_variable ("agi_tax_unit_count" | "agi_person_count" | "agi_total_amount") + target_value (float) + """ + # — keep only what we need and rename for clarity + work = ( + df[["ucgid_str", _BREAKDOWN_FIELD] + list(_TARGET_COL_MAP)] + .rename(columns=_TARGET_COL_MAP) # N1 → agi_tax_unit_count, etc. + ) + + # — wide → long + long = ( + work.melt( + id_vars=["ucgid_str", _BREAKDOWN_FIELD], + var_name="target_variable", + value_name="target_value" + ) + .rename(columns={_BREAKDOWN_FIELD: "breakdown_value"}) + .assign(breakdown_variable=_BREAKDOWN_NAME) + # Optional: add a human‑readable band label if useful + # .assign(breakdown_label=lambda d: d["breakdown_value"].map(AGI_STUB_TO_BAND)) + ) + + # — final column order + long = long[["ucgid_str", + "breakdown_variable", + "breakdown_value", + "target_variable", + "target_value"]] + + # consistently sort (purely cosmetic) + return ( + long.sort_values(["ucgid_str", "breakdown_value", "target_variable"]) + .reset_index(drop=True) + ) + + def extract_soi_data() -> pd.DataFrame: """Download and save congressional district AGI totals. 
@@ -258,6 +348,32 @@ def extract_soi_data() -> pd.DataFrame:
 
 raw_df = extract_soi_data()
 # a "stub" is a term the IRS uses for a predefined category or group, specifically an income bracket.
 
+TARGETS = [
+    dict(code="59661", name="eitc", breakdown=("eitc_children", 0)),
+    dict(code="59662", name="eitc", breakdown=("eitc_children", 1)),
+    dict(code="59663", name="eitc", breakdown=("eitc_children", 2)),
+    dict(code="59664", name="eitc", breakdown=("eitc_children", "3+")),
+    dict(code="59664", name="qbid", breakdown=None),
+    dict(code="18500", name="real_estate_taxes", breakdown=None),
+    dict(code="01000", name="net_capital_gain", breakdown=None),
+    dict(code="03150", name="ira_payments", breakdown=None),
+    dict(code="00300", name="taxable_interest", breakdown=None),
+    dict(code="00400", name="tax_exempt_interest", breakdown=None),
+    dict(code="00600", name="ordinary_dividends", breakdown=None),
+    dict(code="00650", name="qualified_dividends", breakdown=None),
+    dict(code="26270", name="partnership_and_s_corp_net_income", breakdown=None),
+    dict(code="02500", name="total_social_security", breakdown=None),
+    dict(code="01700", name="pension_and_annuities", breakdown=None),
+    dict(code="02300", name="unemployment_compensation", breakdown=None),
+    dict(code="00900", name="business_net_income", breakdown=None),
+    dict(code="17000", name="medical_and_dental_deduction", breakdown=None),
+    dict(code="00700", name="salt_refunds", breakdown=None),
+    dict(code="18425", name="salt_amount", breakdown=None),
+    dict(code="06500", name="income_tax", breakdown=None),
+]
+
+
+
 def transform_soi_data(raw_df)
 
@@ -317,104 +433,53 @@ def transform_soi_data(raw_df)
     # Collect targets from the SOI file
     records = []
-
-    # EITC ---------------------------------------------------------------------------
-    eitc_no_children = all_marginals.copy().rename({
-        'N59661': 'tax_unit_count',
-        'A59661': 'eitc'
-    }, axis = 1)
-    eitc_no_children['eitc_children'] = 0
-
-    records.append(
-        create_records(eitc_no_children, "eitc_children", "tax_unit_count")
-    )
-    records.append(
-        create_records(eitc_no_children, "eitc_children", "eitc")
-    )
-
-    eitc_one_child = all_marginals.copy().rename({
-        'N59662': 'tax_unit_count',
-        'A59662': 'eitc'
-    }, axis=1)
-    eitc_one_child['eitc_children'] = 1
-
-    records.append(
-        create_records(eitc_one_child, "eitc_children", "tax_unit_count")
-    )
-    records.append(
-        create_records(eitc_one_child, "eitc_children", "eitc")
-    )
-
-    eitc_two_children = all_marginals.copy().rename({
-        'N59663': 'tax_unit_count',
-        'A59663': 'eitc'
-    }, axis=1)
-    eitc_two_children['eitc_children'] = 2
-
-    records.append(
-        create_records(eitc_two_children, "eitc_children", "tax_unit_count")
-    )
-    records.append(
-        create_records(eitc_two_children, "eitc_children", "eitc")
-    )
-
-    eitc_three_plus_children = all_marginals.copy().rename({
-        'N59664': 'tax_unit_count',
-        'A59664': 'eitc'
-    }, axis=1)
-    eitc_three_plus_children['eitc_children'] = '3+'
-
-    records.append(
-        create_records(eitc_three_plus_children, "eitc_children", "tax_unit_count")
-    )
-    records.append(
-        create_records(eitc_three_plus_children, "eitc_children", "eitc")
-    )
-
-    # QBID ----------------------------------------------------------------------
-    qbid = all_marginals.copy().rename({
-        'N59664': 'tax_unit_count',
-        'A59664': 'qbid'
-    }, axis=1)
-    # No breakdown variable other than the geo here
-    qbid['one'] = 1
-
-    records.append(
-        create_records(qbid, "one", "tax_unit_count")
-    )
-    records.append(
-        create_records(qbid, "one", "qbid")
-    )
-
-    # SALT 
----------------------------------------------------------------------- - - # TODO: THERE's definitely a pattern here - # TODO: you forgot to multiply by 1000! - # For all the files, the money amounts are reported in thousands of dollars. - salt = all_marginals.copy().rename({ - 'N18425': 'tax_unit_count', - 'A18425': 'salt' - }, axis=1) - salt['one'] = 1 - - records.append( - create_records(salt, "one", "tax_unit_count") - ) - records.append( - create_records(qbid, "one", "salt") - ) - - - return records - + for spec in TARGETS: + count_col = f"N{spec['code']}" # e.g. 'N59661' + amount_col = f"A{spec['code']}" # e.g. 'A59661' + + df = all_marginals.copy() + + if spec["breakdown"] is not None: + col, val = spec["breakdown"] + df[col] = val + breakdown_col = col + else: + breakdown_col = None + + rec_counts, rec_amounts = make_records( + df, + count_col = count_col, + amount_col = amount_col, + amount_name = spec["name"], + breakdown_col = breakdown_col, + multiplier = 1_000, + ) + records.extend([rec_counts, rec_amounts]) + # Custom AGI amount, which doesn't have a count column (it has N1 and N2) + temp_df = df[["ucgid_str"]].copy() + temp_df["breakdown_variable"] = "one" + temp_df["breakdown_value"] = 1 + temp_df["target_variable"] = "agi" + temp_df["target_value"] = df["A00100"] * 1_000 + records.append(temp_df) + # It's notable that the national counts only have agi_stub = 0 + all_agi_splits = all_df.copy().loc[all_df.agi_stub != 0] + assert all_agi_splits.shape[0] % (436 + 51 + 0) == 0 + # Still a bit of work to do at the time of loading, since the breakdown variable + # is agi_stub + agi_long = make_agi_long(all_agi_splits) + # We have the distribution and the total amount, let's not go crazy here + agi_long = agi_long.loc[agi_long.target_variable != "agi_total_amount"] + records.append(agi_long) + return pd.concat(records) def _get_soi_data(geo_level: str) -> pd.DataFrame: From 57d98501e9502ac94e41cb9c6aebc31d730360b4 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 8 Aug 2025 17:00:03 -0400 Subject: [PATCH 07/27] medicaid etl file --- policyengine_us_data/db/etl_medicaid.py | 206 ++++++++++++++++++++++++ 1 file changed, 206 insertions(+) create mode 100644 policyengine_us_data/db/etl_medicaid.py diff --git a/policyengine_us_data/db/etl_medicaid.py b/policyengine_us_data/db/etl_medicaid.py new file mode 100644 index 00000000..71786fa0 --- /dev/null +++ b/policyengine_us_data/db/etl_medicaid.py @@ -0,0 +1,206 @@ +import requests +import pandas as pd + + + +# This is from another file +#def extract_docs(year=2023): +# docs_url = ( +# f"https://api.census.gov/data/{year}/acs/acs1/subject/variables.json" +# ) +# +# try: +# docs_response = requests.get(docs_url) +# docs_response.raise_for_status() +# +# docs = docs_response.json() +# docs["year"] = year +# +# except requests.exceptions.RequestException as e: +# print(f"Error during API request: {e}") +# raise +# except Exception as e: +# print(f"An error occurred: {e}") +# raise +# return docs + + + +# State abbreviation to FIPS code mapping +state_fips_map = { + 'AL': '01', 'AK': '02', 'AZ': '04', 'AR': '05', 'CA': '06', + 'CO': '08', 'CT': '09', 'DE': '10', 'FL': '12', 'GA': '13', + 'HI': '15', 'ID': '16', 'IL': '17', 'IN': '18', 'IA': '19', + 'KS': '20', 'KY': '21', 'LA': '22', 'ME': '23', 'MD': '24', + 'MA': '25', 'MI': '26', 'MN': '27', 'MS': '28', 'MO': '29', + 'MT': '30', 'NE': '31', 'NV': '32', 'NH': '33', 'NJ': '34', + 'NM': '35', 'NY': '36', 'NC': '37', 'ND': '38', 'OH': '39', + 'OK': '40', 'OR': '41', 'PA': '42', 'RI': '44', 
'SC': '45', + 'SD': '46', 'TN': '47', 'TX': '48', 'UT': '49', 'VT': '50', + 'VA': '51', 'WA': '53', 'WV': '54', 'WI': '55', 'WY': '56', + 'DC': '11' +} + + + +# I can get data from: + + "S2704_C02_006E": { + "label": "Estimate!!Public Coverage!!COVERAGE ALONE OR IN COMBINATION!!Medicaid/means-tested public coverage alone or in combination", + "concept": "Public Health Insurance Coverage by Type and Selected Characteristics", + "predicateType": "int", + "group": "S2704", + "limit": 0, + "attributes": "S2704_C02_006EA,S2704_C02_006M,S2704_C02_006MA" + }, + + +def extract_medicaid_data(): + year = 2023 + base_url = ( + f"https://api.census.gov/data/{year}/acs/acs1/subject?get=group(S2704)" + ) + url = f"{base_url}&for=congressional+district:*" + response = requests.get(url) + response.raise_for_status() + + data = response.json() + + headers = data[0] + data_rows = data[1:] + cd_survey_df = pd.DataFrame(data_rows, columns=headers) + + item = "6165f45b-ca93-5bb5-9d06-db29c692a360" + response = requests.get( + f"https://data.medicaid.gov/api/1/metastore/schemas/dataset/items/{item}?show-reference-ids=false" + ) + metadata = response.json() + + data_url = metadata['distribution'][0]['data']['downloadURL'] + state_admin_df = pd.read_csv(data_url) + + return cd_survey_df, state_admin_df + + +cd_survey_df, state_admin_df = extract_medicaid_data() + +def transform_medicaid_data(state_admin_df, cd_survey_df): + state_df = state_admin_df.loc[ + (state_admin_df["Reporting Period"] == 202312) & + (state_admin_df["Final Report"] == "Y"), + ["State Abbreviation", "Reporting Period", "Total Medicaid Enrollment"] + ] + + state_df["FIPS"] = state_df["State Abbreviation"].map(state_fips_map) + + cd_df = cd_survey_df[["GEO_ID", "state", "congressional district", "S2704_C02_006E"]] + + nc_cd_sum = cd_df.loc[cd_df.state == "37"].S2704_C02_006E.astype(int).sum() + nc_state_sum = state_df.loc[state_df.FIPS == '37']['Total Medicaid Enrollment'].values[0] + assert nc_cd_sum > .5 * nc_state_sum + assert nc_cd_sum <= nc_state_sum + + return long_df + +# YOU KNOW WHAT TO DO! + +def load_medicaid_data(): + + pass + + + + + + + + + + + + + + + + + +def _geo_clause_for(geo: str) -> str: + if geo == "National": + return "for=us:*" + if geo == "State": + return "for=state:*" + if geo == "District": + # Congressional districts; adding 'in=state:*' avoids API ambiguities + return "for=congressional+district:*&in=state:*" + raise ValueError("geo must be 'National', 'State', or 'District'") + + +def _group_meta(year: int, dataset: str, group: str) -> dict: + url = f"https://api.census.gov/data/{year}/{dataset}/groups/{group}.json" + r = requests.get(url, timeout=60) + r.raise_for_status() + return r.json() + + +def extract_medicaid_s2701(geo: str, year: int = 2023, + which: str = "estimate", + by_age: bool = True) -> pd.DataFrame: + """ + Pulls ACS S2701 'With Medicaid/means-tested public coverage' for the requested geography. + which: 'estimate' (counts) or 'percent' + by_age: True -> Under 19, 19-64, 65+ ; False -> all ages combined + Returns: tidy DataFrame with readable columns plus geo identifiers. 
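+    Example call: extract_medicaid_s2701("State", 2023, which="estimate", by_age=False)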
+ """ + dataset = "acs/acs1/subject" + group = "S2701" + meta = _group_meta(year, dataset, group)["variables"] + + target_prefix = "Estimate" if which == "estimate" else "Percent" + selected, rename = [], {} + + for vid, v in meta.items(): + pass + + if not vid.endswith("E"): # just the estimates + continue + label = v["label"] + if not label.startswith(target_prefix): + continue + ## Keep 'With Medicaid/means-tested public coverage' + #if "COVERAGE TYPE!!With Medicaid/means-tested public coverage" not in label: + # continue + + has_age = "!!AGE!!" in label + if by_age and not has_age: + continue + if not by_age and has_age: + continue + + selected.append(vid) + nice = label.split("!!")[-1] if by_age else "All ages" + rename[vid] = f"Medicaid ({nice}) - {which}" + + if not selected: + raise RuntimeError("No S2701 Medicaid variables matched. Check 'which' or 'by_age' options.") + + get_vars = ["NAME"] + selected + url = f"https://api.census.gov/data/{year}/{dataset}?get={','.join(get_vars)}&{_geo_clause_for(geo)}" + r = requests.get(url, timeout=120) + r.raise_for_status() + raw = r.json() + + df = pd.DataFrame(raw[1:], columns=raw[0]) + for vid in selected: + df[vid] = pd.to_numeric(df[vid], errors="coerce") + df = df.rename(columns=rename) + + # Reorder: geo columns first, then NAME, then our measures + geo_cols = [c for c in ["us", "state", "congressional district"] if c in df.columns] + measure_cols = [rename[v] for v in selected] + return df[geo_cols + ["NAME"] + measure_cols] + + +df = extract_medicaid_s2701("District", + 2023, + "estimate", + False) From 9c4838e5a99fbc43ac89d7ff6d2a1f3986b4ef45 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Sun, 10 Aug 2025 22:55:06 -0400 Subject: [PATCH 08/27] medicaid is loading in --- policyengine_us_data/db/etl_medicaid.py | 242 +++++++++++------------- 1 file changed, 114 insertions(+), 128 deletions(-) diff --git a/policyengine_us_data/db/etl_medicaid.py b/policyengine_us_data/db/etl_medicaid.py index 71786fa0..395bc109 100644 --- a/policyengine_us_data/db/etl_medicaid.py +++ b/policyengine_us_data/db/etl_medicaid.py @@ -2,28 +2,14 @@ import pandas as pd +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker -# This is from another file -#def extract_docs(year=2023): -# docs_url = ( -# f"https://api.census.gov/data/{year}/acs/acs1/subject/variables.json" -# ) -# -# try: -# docs_response = requests.get(docs_url) -# docs_response.raise_for_status() -# -# docs = docs_response.json() -# docs["year"] = year -# -# except requests.exceptions.RequestException as e: -# print(f"Error during API request: {e}") -# raise -# except Exception as e: -# print(f"An error occurred: {e}") -# raise -# return docs - +from policyengine_us_data.db.create_database_tables import ( + Stratum, + StratumConstraint, + Target, +) # State abbreviation to FIPS code mapping @@ -42,17 +28,14 @@ } - -# I can get data from: - - "S2704_C02_006E": { - "label": "Estimate!!Public Coverage!!COVERAGE ALONE OR IN COMBINATION!!Medicaid/means-tested public coverage alone or in combination", - "concept": "Public Health Insurance Coverage by Type and Selected Characteristics", - "predicateType": "int", - "group": "S2704", - "limit": 0, - "attributes": "S2704_C02_006EA,S2704_C02_006M,S2704_C02_006MA" - }, +#"S2704_C02_006E": { +# "label": "Estimate!!Public Coverage!!COVERAGE ALONE OR IN COMBINATION!!Medicaid/means-tested public coverage alone or in combination", +# "concept": "Public Health Insurance Coverage by Type and Selected Characteristics", +# 
"predicateType": "int", +# "group": "S2704", +# "limit": 0, +# "attributes": "S2704_C02_006EA,S2704_C02_006M,S2704_C02_006MA" +#}, def extract_medicaid_data(): @@ -82,8 +65,6 @@ def extract_medicaid_data(): return cd_survey_df, state_admin_df -cd_survey_df, state_admin_df = extract_medicaid_data() - def transform_medicaid_data(state_admin_df, cd_survey_df): state_df = state_admin_df.loc[ (state_admin_df["Reporting Period"] == 202312) & @@ -100,107 +81,112 @@ def transform_medicaid_data(state_admin_df, cd_survey_df): assert nc_cd_sum > .5 * nc_state_sum assert nc_cd_sum <= nc_state_sum - return long_df - -# YOU KNOW WHAT TO DO! - -def load_medicaid_data(): - - pass - - - - - - - - - + state_df = state_df.rename(columns={'Total Medicaid Enrollment': 'medicaid_enrollment'}) + state_df['ucgid_str'] = '0400000US' + state_df['FIPS'].astype(str) + cd_df = cd_df.rename(columns={'S2704_C02_006E': 'medicaid_enrollment', 'GEO_ID': 'ucgid_str'}) + cd_df = cd_df.loc[cd_df.state != '72'] + out_cols = ['ucgid_str', 'medicaid_enrollment'] + return state_df[out_cols], cd_df[out_cols] +def load_medicaid_data(long_state, long_cd): + DATABASE_URL = "sqlite:///policy_data.db" + engine = create_engine(DATABASE_URL) + Session = sessionmaker(bind=engine) + session = Session() + stratum_lookup = {} -def _geo_clause_for(geo: str) -> str: - if geo == "National": - return "for=us:*" - if geo == "State": - return "for=state:*" - if geo == "District": - # Congressional districts; adding 'in=state:*' avoids API ambiguities - return "for=congressional+district:*&in=state:*" - raise ValueError("geo must be 'National', 'State', or 'District'") - - -def _group_meta(year: int, dataset: str, group: str) -> dict: - url = f"https://api.census.gov/data/{year}/{dataset}/groups/{group}.json" - r = requests.get(url, timeout=60) - r.raise_for_status() - return r.json() - - -def extract_medicaid_s2701(geo: str, year: int = 2023, - which: str = "estimate", - by_age: bool = True) -> pd.DataFrame: - """ - Pulls ACS S2701 'With Medicaid/means-tested public coverage' for the requested geography. - which: 'estimate' (counts) or 'percent' - by_age: True -> Under 19, 19-64, 65+ ; False -> all ages combined - Returns: tidy DataFrame with readable columns plus geo identifiers. - """ - dataset = "acs/acs1/subject" - group = "S2701" - meta = _group_meta(year, dataset, group)["variables"] - - target_prefix = "Estimate" if which == "estimate" else "Percent" - selected, rename = [], {} - - for vid, v in meta.items(): - pass - - if not vid.endswith("E"): # just the estimates - continue - label = v["label"] - if not label.startswith(target_prefix): - continue - ## Keep 'With Medicaid/means-tested public coverage' - #if "COVERAGE TYPE!!With Medicaid/means-tested public coverage" not in label: - # continue - - has_age = "!!AGE!!" in label - if by_age and not has_age: - continue - if not by_age and has_age: - continue - - selected.append(vid) - nice = label.split("!!")[-1] if by_age else "All ages" - rename[vid] = f"Medicaid ({nice}) - {which}" - - if not selected: - raise RuntimeError("No S2701 Medicaid variables matched. 
Check 'which' or 'by_age' options.") - - get_vars = ["NAME"] + selected - url = f"https://api.census.gov/data/{year}/{dataset}?get={','.join(get_vars)}&{_geo_clause_for(geo)}" - r = requests.get(url, timeout=120) - r.raise_for_status() - raw = r.json() - - df = pd.DataFrame(raw[1:], columns=raw[0]) - for vid in selected: - df[vid] = pd.to_numeric(df[vid], errors="coerce") - df = df.rename(columns=rename) - - # Reorder: geo columns first, then NAME, then our measures - geo_cols = [c for c in ["us", "state", "congressional district"] if c in df.columns] - measure_cols = [rename[v] for v in selected] - return df[geo_cols + ["NAME"] + measure_cols] + # Wow, the first time we're making these geos with no breakdown variable + # National ---------------- + nat_stratum = Stratum( + parent_stratum_id=None, stratum_group_id=0, notes="Geo: 0100000US" + ) + nat_stratum.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="in", + value="0100000US", + ) + ] -df = extract_medicaid_s2701("District", - 2023, - "estimate", - False) + session.add(nat_stratum) + session.flush() + stratum_lookup["National"] = nat_stratum.stratum_id + + # State ------------------- + stratum_lookup["State"] = {} + for _, row in long_state.iterrows(): + + note = f"Geo: {row['ucgid_str']}" + parent_stratum_id = nat_stratum.stratum_id + + new_stratum = Stratum( + parent_stratum_id=parent_stratum_id, stratum_group_id=0, notes=note + ) + new_stratum.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="in", + value=row["ucgid_str"], + ), + ] + new_stratum.targets_rel.append( + Target( + variable="medicaid_enrollment", + period=2023, + value=row["medicaid_enrollment"], + source_id=2, + active=True, + ) + ) + session.add(new_stratum) + session.flush() + stratum_lookup["State"][row['ucgid_str']] = new_stratum.stratum_id + + + # District ------------------- + stratum_lookup["District"] = {} + for _, row in long_cd.iterrows(): + + note = f"Geo: {row['ucgid_str']}" + parent_stratum_id = stratum_lookup["State"][f'0400000US{row["ucgid_str"][-4:-2]}'] + + new_stratum = Stratum( + parent_stratum_id=parent_stratum_id, stratum_group_id=0, notes=note + ) + new_stratum.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="in", + value=row["ucgid_str"], + ), + ] + new_stratum.targets_rel.append( + Target( + variable="medicaid_enrollment", + period=2023, + value=row["medicaid_enrollment"], + source_id=2, + active=True, + ) + ) + session.add(new_stratum) + session.flush() + + + session.commit() + + return stratum_lookup + +if __name__ == "__main__": + cd_survey_df, state_admin_df = extract_medicaid_data() + + long_state, long_cd = transform_medicaid_data(state_admin_df, cd_survey_df) + + load_medicaid_data(long_state, long_cd) From 57716f2088b470c06475273b757b2890690b65a9 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Mon, 11 Aug 2025 23:18:08 -0400 Subject: [PATCH 09/27] medicaid and some SNAP data --- policyengine_us_data/db/etl_medicaid.py | 79 +++-- policyengine_us_data/db/etl_snap.py | 438 ++++++++++++++++++++++++ policyengine_us_data/utils/census.py | 42 +++ 3 files changed, 522 insertions(+), 37 deletions(-) create mode 100644 policyengine_us_data/db/etl_snap.py create mode 100644 policyengine_us_data/utils/census.py diff --git a/policyengine_us_data/db/etl_medicaid.py b/policyengine_us_data/db/etl_medicaid.py index 395bc109..d1babe31 100644 --- a/policyengine_us_data/db/etl_medicaid.py +++ 
b/policyengine_us_data/db/etl_medicaid.py @@ -1,7 +1,6 @@ import requests -import pandas as pd - +import pandas as pd from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker @@ -28,18 +27,7 @@ } -#"S2704_C02_006E": { -# "label": "Estimate!!Public Coverage!!COVERAGE ALONE OR IN COMBINATION!!Medicaid/means-tested public coverage alone or in combination", -# "concept": "Public Health Insurance Coverage by Type and Selected Characteristics", -# "predicateType": "int", -# "group": "S2704", -# "limit": 0, -# "attributes": "S2704_C02_006EA,S2704_C02_006M,S2704_C02_006MA" -#}, - - -def extract_medicaid_data(): - year = 2023 +def extract_medicaid_data(year): base_url = ( f"https://api.census.gov/data/{year}/acs/acs1/subject?get=group(S2704)" ) @@ -65,9 +53,12 @@ def extract_medicaid_data(): return cd_survey_df, state_admin_df -def transform_medicaid_data(state_admin_df, cd_survey_df): +def transform_medicaid_data(state_admin_df, cd_survey_df, year): + + reporting_period = year * 100 + 12 + print(f"Reporting period is {reporting_period}") state_df = state_admin_df.loc[ - (state_admin_df["Reporting Period"] == 202312) & + (state_admin_df["Reporting Period"] == reporting_period) & (state_admin_df["Final Report"] == "Y"), ["State Abbreviation", "Reporting Period", "Total Medicaid Enrollment"] ] @@ -91,29 +82,34 @@ def transform_medicaid_data(state_admin_df, cd_survey_df): return state_df[out_cols], cd_df[out_cols] -def load_medicaid_data(long_state, long_cd): +def load_medicaid_data(long_state, long_cd, year): DATABASE_URL = "sqlite:///policy_data.db" engine = create_engine(DATABASE_URL) + year = 2023 Session = sessionmaker(bind=engine) session = Session() stratum_lookup = {} - # Wow, the first time we're making these geos with no breakdown variable - # National ---------------- nat_stratum = Stratum( - parent_stratum_id=None, stratum_group_id=0, notes="Geo: 0100000US" + parent_stratum_id=None, stratum_group_id=0, notes="Geo: 0100000US Medicaid Enrolled" ) nat_stratum.constraints_rel = [ StratumConstraint( constraint_variable="ucgid_str", - operation="in", + operation="equals", value="0100000US", - ) + ), + StratumConstraint( + constraint_variable="medicaid_enrolled", + operation="equals", + value="True", + ), ] + # No target at the national level is provided at this time. 
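+    # (If a national target is wanted later, one option would be to sum the
+    # state administrative totals, long_state["medicaid_enrollment"].sum(),
+    # and attach that as a Target on nat_stratum.)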
session.add(nat_stratum) session.flush() @@ -123,7 +119,7 @@ def load_medicaid_data(long_state, long_cd): stratum_lookup["State"] = {} for _, row in long_state.iterrows(): - note = f"Geo: {row['ucgid_str']}" + note = f"Geo: {row['ucgid_str']} Medicaid Enrolled" parent_stratum_id = nat_stratum.stratum_id new_stratum = Stratum( @@ -132,14 +128,19 @@ def load_medicaid_data(long_state, long_cd): new_stratum.constraints_rel = [ StratumConstraint( constraint_variable="ucgid_str", - operation="in", + operation="equals", value=row["ucgid_str"], ), + StratumConstraint( + constraint_variable="medicaid_enrolled", + operation="equals", + value="True", + ), ] new_stratum.targets_rel.append( Target( - variable="medicaid_enrollment", - period=2023, + variable="person_count", + period=year, value=row["medicaid_enrollment"], source_id=2, active=True, @@ -149,12 +150,10 @@ def load_medicaid_data(long_state, long_cd): session.flush() stratum_lookup["State"][row['ucgid_str']] = new_stratum.stratum_id - # District ------------------- - stratum_lookup["District"] = {} for _, row in long_cd.iterrows(): - note = f"Geo: {row['ucgid_str']}" + note = f"Geo: {row['ucgid_str']} Medicaid Enrolled" parent_stratum_id = stratum_lookup["State"][f'0400000US{row["ucgid_str"][-4:-2]}'] new_stratum = Stratum( @@ -163,14 +162,19 @@ def load_medicaid_data(long_state, long_cd): new_stratum.constraints_rel = [ StratumConstraint( constraint_variable="ucgid_str", - operation="in", + operation="equals", value=row["ucgid_str"], ), + StratumConstraint( + constraint_variable="medicaid_enrolled", + operation="equals", + value="True", + ), ] new_stratum.targets_rel.append( Target( - variable="medicaid_enrollment", - period=2023, + variable="person_count", + period=year, value=row["medicaid_enrollment"], source_id=2, active=True, @@ -179,14 +183,15 @@ def load_medicaid_data(long_state, long_cd): session.add(new_stratum) session.flush() - session.commit() - return stratum_lookup if __name__ == "__main__": - cd_survey_df, state_admin_df = extract_medicaid_data() - long_state, long_cd = transform_medicaid_data(state_admin_df, cd_survey_df) + year = 2023 + + cd_survey_df, state_admin_df = extract_medicaid_data(year) + + long_state, long_cd = transform_medicaid_data(state_admin_df, cd_survey_df, year) - load_medicaid_data(long_state, long_cd) + load_medicaid_data(long_state, long_cd, year) diff --git a/policyengine_us_data/db/etl_snap.py b/policyengine_us_data/db/etl_snap.py new file mode 100644 index 00000000..a82da744 --- /dev/null +++ b/policyengine_us_data/db/etl_snap.py @@ -0,0 +1,438 @@ +import requests +import zipfile +import io +import os +import re +from pathlib import Path + +import pandas as pd +import numpy as np +import us +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker + +from policyengine_us_data.db.create_database_tables import ( + Stratum, + StratumConstraint, + Target, +) +from policyengine_us_data.utils.census import ( + get_census_docs, + pull_subject_table, +) + + +STATE_NAME_TO_FIPS = { + "Alabama": "01", + "Alaska": "02", + "Arizona": "04", + "Arkansas": "05", + "California": "06", + "Colorado": "08", + "Connecticut": "09", + "Delaware": "10", + "District of Columbia": "11", + "Florida": "12", + "Georgia": "13", + "Hawaii": "15", + "Idaho": "16", + "Illinois": "17", + "Indiana": "18", + "Iowa": "19", + "Kansas": "20", + "Kentucky": "21", + "Louisiana": "22", + "Maine": "23", + "Maryland": "24", + "Massachusetts": "25", + "Michigan": "26", + "Minnesota": "27", + "Mississippi": "28", + 
"Missouri": "29", + "Montana": "30", + "Nebraska": "31", + "Nevada": "32", + "New Hampshire": "33", + "New Jersey": "34", + "New Mexico": "35", + "New York": "36", + "North Carolina": "37", + "North Dakota": "38", + "Ohio": "39", + "Oklahoma": "40", + "Oregon": "41", + "Pennsylvania": "42", + "Rhode Island": "44", + "South Carolina": "45", + "South Dakota": "46", + "Tennessee": "47", + "Texas": "48", + "Utah": "49", + "Vermont": "50", + "Virginia": "51", + "Washington": "53", + "West Virginia": "54", + "Wisconsin": "55", + "Wyoming": "56", +} + + +def extract_administrative_snap_data(year=2023): + """ + Downloads and extracts annual state-level SNAP data from the USDA FNS zip file. + """ + url = "https://www.fns.usda.gov/sites/default/files/resource-files/snap-zip-fy69tocurrent-6.zip" + + # Note: extra complexity in request due to regional restrictions on downloads (e.g., Spain) + headers = { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.5", + "Accept-Encoding": "gzip, deflate", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", + } + + try: + session = requests.Session() + session.headers.update(headers) + + # Try to visit the main page first to get any necessary cookies + main_page = "https://www.fns.usda.gov/pd/supplemental-nutrition-assistance-program-snap" + try: + session.get(main_page, timeout=30) + except: + pass # Ignore errors on the main page + + response = session.get(url, timeout=30, allow_redirects=True) + response.raise_for_status() + except requests.exceptions.RequestException as e: + print(f"Error downloading file: {e}") + # Try alternative URL or method + try: + alt_url = "https://www.fns.usda.gov/sites/default/files/resource-files/snap-zip-fy69tocurrent-6.zip" + response = session.get(alt_url, timeout=30, allow_redirects=True) + response.raise_for_status() + except requests.exceptions.RequestException as e2: + print(f"Alternative URL also failed: {e2}") + return None + + return zipfile.ZipFile(io.BytesIO(response.content)) + + +def transform_snap_administrative_data(zip_file, year): + filename = f"FY{str(year)[-2:]}.xlsx" + with zip_file.open(filename) as f: + xls = pd.ExcelFile(f) + tab_results = [] + for sheet_name in [ + "NERO", + "MARO", + "SERO", + "MWRO", + "SWRO", + "MPRO", + "WRO", + ]: + df_raw = pd.read_excel( + xls, sheet_name=sheet_name, header=None, dtype={0: str} + ) + + state_row_mask = ( + df_raw[0].notna() + & df_raw[1].isna() + & ~df_raw[0].str.contains("Total", na=False) + & ~df_raw[0].str.contains("Footnote", na=False) + ) + + df_raw["State"] = df_raw.loc[state_row_mask, 0] + df_raw["State"] = df_raw["State"].ffill() + total_rows = df_raw[df_raw[0].eq("Total")].copy() + total_rows = total_rows.rename( + columns={ + 1: "Households", + 2: "Persons", + 3: "Cost", + } + ) + + state_totals = total_rows[ + [ + "State", + "Households", + "Persons", + "Cost", # Annual (Note: the CostPer* vars are monthly) + ] + ] + + tab_results.append(state_totals) + + results_df = pd.concat(tab_results) + + df_states = results_df.loc[ + results_df["State"].isin(STATE_NAME_TO_FIPS.keys()) + ].copy() + df_states["STATE_FIPS"] = df_states["State"].map(STATE_NAME_TO_FIPS) + df_states = ( + df_states.loc[~df_states["STATE_FIPS"].isna()] + .sort_values("STATE_FIPS") + .reset_index(drop=True) + ) + df_states["ucgid_str"] = "0400000US" + 
df_states["STATE_FIPS"] + + # I don't think I need to make this long, because it's going to be 3 different variables + #df_states[['ucgid_str', 'Households']] + #df_states[['ucgid_str', 'Persons']] + #df_states[['ucgid_str', 'Cost']] + + return df_states + + +def load_snap_administrative_data(?, year): + + year = 2023 + + DATABASE_URL = "sqlite:///policy_data.db" + engine = create_engine(DATABASE_URL) + + Session = sessionmaker(bind=engine) + session = Session() + + stratum_lookup = {} + + # National ---------------- + nat_stratum = Stratum( + parent_stratum_id=None, stratum_group_id=0, notes="Geo: 0100000US Received SNAP Benefits" + ) + nat_stratum.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="equals", + value="0100000US", + ), + StratumConstraint( + constraint_variable="snap", + operation="is_greater_than", + value="0", + ), + ] + # No target at the national level is provided at this time. + + session.add(nat_stratum) + session.flush() + stratum_lookup["National"] = nat_stratum.stratum_id + + # State ------------------- + stratum_lookup["State"] = {} + for _, row in df_states.iterrows(): + + note = f"Geo: {row['ucgid_str']} Received SNAP Benefits" + parent_stratum_id = nat_stratum.stratum_id + + new_stratum = Stratum( + parent_stratum_id=parent_stratum_id, stratum_group_id=0, notes=note + ) + new_stratum.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="equals", + value=row["ucgid_str"], + ), + StratumConstraint( + constraint_variable="snap", + operation="is_greater_than", + value="0", + ), + ] + # Two targets now. Same data source. Same stratum + new_stratum.targets_rel.append( + Target( + variable="household_count", + period=year, + value=row["Households"], + source_id=3, + active=True, + ) + ) + new_stratum.targets_rel.append( + Target( + variable="snap", + period=year, + value=row["Cost"], + source_id=3, + active=True, + ) + ) + session.add(new_stratum) + session.flush() + stratum_lookup["State"][row['ucgid_str']] = new_stratum.stratum_id + + session.commit() + + + +# Moving away from administrative data to get the survey data ------ + + + +def extract_survey_snap_data(year): + + # Household count data ----- + data = pull_acs_table("S2201", "National", 2023) + data["S2201_C03_001E"] + + + # Ha, this is off my a factor of 1000, and ACS does not report dollars in 1000s + # TODO: try to figure it out. 
+ data = pull_acs_table("B19058", "State", 2023) + np.sum(data["B19058_001E"].values.astype(int)) / 1E9 + + + + raw_dfs = {} + for geo in ["District", "State", "National"]: + df = pull_subject_table(group, geo, year) + df_data = df.rename(columns=rename_mapping)[ + ["GEO_ID", "NAME"] + list(label_to_short_name_mapping.values()) + ] + if geo == "State": + raw_dfs["DC"] = df_data[df_data["GEO_ID"].isin(["0400000US11"])] + + # Filter out Puerto Rico + df_geos = df_data[ + ~df_data["GEO_ID"].isin( + [ + "5001800US7298", + "0400000US72", + ] + ) + ].copy() + raw_dfs[geo] = df_geos + SAVE_DIR = Path(get_data_directory() / "input" / "demographics") + df_geos.to_csv(SAVE_DIR / f"raw_snap_{geo}.csv", index=False) + + folder_path = ( + f"{get_data_directory()}/targets/edition=raw/" + f"base_period={year}/reference_period={year}/" + f"variable=snap_households/" + ) + raw_out = pd.concat([ + raw_dfs['National'][['GEO_ID', 'overall']], + raw_dfs['State'][['GEO_ID', 'overall']], + raw_dfs['DC'][['GEO_ID', 'overall']], + raw_dfs['District'][['GEO_ID', 'overall']] + ]).rename({"GEO_ID": "geography_id", "overall": "value"}, axis=1) + + raw_out.to_csv(os.path.join(folder_path, "part-001.csv"), index=False) + + additive_dfs = enforce_geographic_self_consistency(raw_dfs, 'overall') + usda_snap_df = extract_usda_snap_data() + adjusted_dfs = adjust_to_administrative_data(additive_dfs, 'overall', usda_snap_df) + assert check_geographic_consistency(adjusted_dfs, 'overall') + + folder_path = ( + f"{get_data_directory()}/targets/edition=cleaned/" + f"base_period={year}/reference_period={year}/" + f"variable=snap_households/" + ) + + clean_out = pd.concat([ + adjusted_dfs['National'][['GEO_ID', 'overall']], + adjusted_dfs['State'][['GEO_ID', 'overall']], + adjusted_dfs['DC'][['GEO_ID', 'overall']], + adjusted_dfs['District'][['GEO_ID', 'overall']] + ]).rename({"GEO_ID": "geography_id", "overall": "value"}, axis=1) + + clean_out.to_csv(os.path.join(folder_path, "part-001.csv"), index=False) + + +def reformat_cleaned_data(): + """Temporary conversion function""" + benefits_dir = Path(get_data_directory() / 'input' / 'benefits') + + snap_filepath = Path( + get_data_directory(), + "targets", + "edition=cleaned", + "base_period=2023", + "reference_period=2023", + "variable=snap_households", + "part-001.csv" + ) + snap_data = pd.read_csv(snap_filepath) + geo_hierarchies = pd.read_csv(Path(get_data_directory(), 'meta', 'geo_hierarchies.csv')) + + # Use Type II SCD to Filter geo_hierarchies for the year 2023 + geo_hierarchies['start_date'] = pd.to_datetime(geo_hierarchies['start_date']) + geo_hierarchies['end_date'] = pd.to_datetime(geo_hierarchies['end_date']) + geo_hierarchies_2023 = geo_hierarchies[ + (geo_hierarchies['start_date'] <= '2023-01-01') & + (geo_hierarchies['end_date'] >= '2023-01-01') + ] + + merged_data = pd.merge(snap_data, geo_hierarchies_2023, left_on='geography_id', right_on='geography_id') + + def create_cleaned_df(data, geo_name_map=None, geo_name_prefix=''): + df = pd.DataFrame() + df['GEO_ID'] = data['geography_id'] + if geo_name_map: + df['GEO_NAME'] = data['geography_id'].map(geo_name_map) + elif 'geography_name' in data.columns: + df['GEO_NAME'] = data['geography_name'] + else: + df['GEO_NAME'] = '' + + df['AGI_LOWER_BOUND'] = '' + df['AGI_UPPER_BOUND'] = '' + df['VALUE'] = data['value'] + df['IS_COUNT'] = 1 + df['VARIABLE'] = 'snap_households' + return df + + # National data + national_data = merged_data[merged_data['geography_type'] == 'nation'].copy() + national_data['geography_name'] = 'US' + 
cleaned_national = create_cleaned_df(national_data) + cleaned_national.to_csv(Path(get_data_directory(), 'input', 'benefits', 'cleaned_snap_national.csv'), index=False) + + # State data + state_data = merged_data[merged_data['geography_type'] == 'state-equivalent'].copy() + # TODO: fix this redundancy if this becomes permanenent + state_fips_map = { + '01': 'AL', '02': 'AK', '04': 'AZ', '05': 'AR', '06': 'CA', '08': 'CO', '09': 'CT', '10': 'DE', '11': 'DC', + '12': 'FL', '13': 'GA', '15': 'HI', '16': 'ID', '17': 'IL', '18': 'IN', '19': 'IA', '20': 'KS', '21': 'KY', + '22': 'LA', '23': 'ME', '24': 'MD', '25': 'MA', '26': 'MI', '27': 'MN', '28': 'MS', '29': 'MO', '30': 'MT', + '31': 'NE', '32': 'NV', '33': 'NH', '34': 'NJ', '35': 'NM', '36': 'NY', '37': 'NC', '38': 'ND', '39': 'OH', + '40': 'OK', '41': 'OR', '42': 'PA', '44': 'RI', '45': 'SC', '46': 'SD', '47': 'TN', '48': 'TX', '49': 'UT', + '50': 'VT', '51': 'VA', '53': 'WA', '54': 'WV', '55': 'WI', '56': 'WY' + } + state_data['state_fips'] = state_data['geography_id'].str[-2:] + state_data['geography_name'] = state_data['state_fips'].map(state_fips_map) + cleaned_state = create_cleaned_df(state_data) + cleaned_state.to_csv(Path('us-congressional-districts/data/input/benefits/cleaned_snap_state.csv', index=False) + cleaned_state.to_csv(Path(get_data_directory(), 'input', 'benefits', 'cleaned_snap_state.csv'), index=False) + + # District data + district_data = merged_data[merged_data['geography_type'] == 'district'].copy() + district_data['state_fips'] = district_data['geography_id'].str[9:11] + district_data['district_num'] = district_data['geography_id'].str[11:] + district_data['geography_name'] = district_data['state_fips'].map(state_fips_map) + ' - District ' + district_data['district_num'] + cleaned_district = create_cleaned_df(district_data) + cleaned_district["VALUE"] = cleaned_district["VALUE"].round().astype(int) + cleaned_district.to_csv(Path(get_data_directory(), 'input', 'benefits', 'cleaned_snap_district.csv'), index=False) + + + +if __name__ == "__main__": + process_snap_data(2023) + + + +def main() -> None: + year = 2023 + + zip_file = extract_snap_data(2023) + + +if __name__ == "__main__": + main() diff --git a/policyengine_us_data/utils/census.py b/policyengine_us_data/utils/census.py new file mode 100644 index 00000000..a728b5ca --- /dev/null +++ b/policyengine_us_data/utils/census.py @@ -0,0 +1,42 @@ +import pathlib +import requests + +import pandas as pd +import numpy as np + + +def get_census_docs(year): + docs_url = ( + f"https://api.census.gov/data/{year}/acs/acs1/subject/variables.json" + ) + # TODO: Alternative: incorporate it! 
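+    # (The bare string below is just a breadcrumb for the detailed-table
+    # variables endpoint; it is not used by this function.)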
+ "https://api.census.gov/data/2023/acs/acs1/variables.json" + + docs_response = requests.get(docs_url) + docs_response.raise_for_status() + + return docs_response.json() + + +def pull_acs_table(group: str, geo: str, year: int) -> pd.DataFrame: + """ + "group": e.g., 'S2201' + "geo": 'National' | 'State' | 'District' + "year": e.g., 2023 + """ + base = f"https://api.census.gov/data/{year}/acs/acs1" + + if group[0] == 'S': + base = base + "/subject" + geo_q = { + "National": "us:*", + "State": "state:*", + "District": "congressional+district:*", + }[geo] + + url = f"{base}?get=group({group})&for={geo_q}" + + data = requests.get(url).json() + headers, rows = data[0], data[1:] + df = pd.DataFrame(rows, columns=headers) + return df From 7b3cacc0186012db08329b89593143aefbdf09c8 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Tue, 12 Aug 2025 13:37:28 -0400 Subject: [PATCH 10/27] got SNAP settled --- ...sury_targets.py => etl_irs_soi_targets.py} | 0 policyengine_us_data/db/etl_snap.py | 272 ++++++++---------- policyengine_us_data/db/load_age_targets.py | 55 ---- policyengine_us_data/utils/census.py | 55 ++++ 4 files changed, 173 insertions(+), 209 deletions(-) rename policyengine_us_data/db/{load_treasury_targets.py => etl_irs_soi_targets.py} (100%) diff --git a/policyengine_us_data/db/load_treasury_targets.py b/policyengine_us_data/db/etl_irs_soi_targets.py similarity index 100% rename from policyengine_us_data/db/load_treasury_targets.py rename to policyengine_us_data/db/etl_irs_soi_targets.py diff --git a/policyengine_us_data/db/etl_snap.py b/policyengine_us_data/db/etl_snap.py index a82da744..a0f20133 100644 --- a/policyengine_us_data/db/etl_snap.py +++ b/policyengine_us_data/db/etl_snap.py @@ -18,7 +18,8 @@ ) from policyengine_us_data.utils.census import ( get_census_docs, - pull_subject_table, + pull_acs_table, + STATE_NAME_TO_FIPS, ) @@ -76,6 +77,7 @@ "Wyoming": "56", } +# Administrative data ------------------------------------------------ def extract_administrative_snap_data(year=2023): """ @@ -120,7 +122,7 @@ def extract_administrative_snap_data(year=2023): return zipfile.ZipFile(io.BytesIO(response.content)) -def transform_snap_administrative_data(zip_file, year): +def transform_administrative_snap_data(zip_file, year): filename = f"FY{str(year)[-2:]}.xlsx" with zip_file.open(filename) as f: xls = pd.ExcelFile(f) @@ -180,17 +182,10 @@ def transform_snap_administrative_data(zip_file, year): ) df_states["ucgid_str"] = "0400000US" + df_states["STATE_FIPS"] - # I don't think I need to make this long, because it's going to be 3 different variables - #df_states[['ucgid_str', 'Households']] - #df_states[['ucgid_str', 'Persons']] - #df_states[['ucgid_str', 'Cost']] - return df_states -def load_snap_administrative_data(?, year): - - year = 2023 +def load_administrative_snap_data(df_states, year): DATABASE_URL = "sqlite:///policy_data.db" engine = create_engine(DATABASE_URL) @@ -207,7 +202,7 @@ def load_snap_administrative_data(?, year): nat_stratum.constraints_rel = [ StratumConstraint( constraint_variable="ucgid_str", - operation="equals", + operation="in", value="0100000US", ), StratumConstraint( @@ -235,7 +230,7 @@ def load_snap_administrative_data(?, year): new_stratum.constraints_rel = [ StratumConstraint( constraint_variable="ucgid_str", - operation="equals", + operation="in", value=row["ucgid_str"], ), StratumConstraint( @@ -268,170 +263,139 @@ def load_snap_administrative_data(?, year): stratum_lookup["State"][row['ucgid_str']] = new_stratum.stratum_id session.commit() + 
return stratum_lookup - -# Moving away from administrative data to get the survey data ------ - - +# Survey data ------------------------------------------------------ def extract_survey_snap_data(year): - # Household count data ----- - data = pull_acs_table("S2201", "National", 2023) - data["S2201_C03_001E"] - - - # Ha, this is off my a factor of 1000, and ACS does not report dollars in 1000s - # TODO: try to figure it out. - data = pull_acs_table("B19058", "State", 2023) - np.sum(data["B19058_001E"].values.astype(int)) / 1E9 + raw_dfs = {} + for geo in ["District", "State", "National"]: + df = pull_acs_table("S2201", geo, year) + raw_dfs[geo] = df + return raw_dfs - raw_dfs = {} - for geo in ["District", "State", "National"]: - df = pull_subject_table(group, geo, year) - df_data = df.rename(columns=rename_mapping)[ - ["GEO_ID", "NAME"] + list(label_to_short_name_mapping.values()) - ] - if geo == "State": - raw_dfs["DC"] = df_data[df_data["GEO_ID"].isin(["0400000US11"])] +def transform_survey_snap_data(raw_dfs): - # Filter out Puerto Rico - df_geos = df_data[ - ~df_data["GEO_ID"].isin( - [ - "5001800US7298", + dfs = {} + for geo in raw_dfs.keys(): + df = raw_dfs[geo] + dfs[geo] = df_data = df[["GEO_ID", "S2201_C03_001E"]].rename({ + "GEO_ID": "ucgid_str", + "S2201_C03_001E": "snap_household_ct" + }, axis=1 + )[ + ~df["GEO_ID"].isin( + [ # Puerto Rico's state and district "0400000US72", + "5001800US7298", ] ) ].copy() - raw_dfs[geo] = df_geos - SAVE_DIR = Path(get_data_directory() / "input" / "demographics") - df_geos.to_csv(SAVE_DIR / f"raw_snap_{geo}.csv", index=False) - - folder_path = ( - f"{get_data_directory()}/targets/edition=raw/" - f"base_period={year}/reference_period={year}/" - f"variable=snap_households/" - ) - raw_out = pd.concat([ - raw_dfs['National'][['GEO_ID', 'overall']], - raw_dfs['State'][['GEO_ID', 'overall']], - raw_dfs['DC'][['GEO_ID', 'overall']], - raw_dfs['District'][['GEO_ID', 'overall']] - ]).rename({"GEO_ID": "geography_id", "overall": "value"}, axis=1) - - raw_out.to_csv(os.path.join(folder_path, "part-001.csv"), index=False) - - additive_dfs = enforce_geographic_self_consistency(raw_dfs, 'overall') - usda_snap_df = extract_usda_snap_data() - adjusted_dfs = adjust_to_administrative_data(additive_dfs, 'overall', usda_snap_df) - assert check_geographic_consistency(adjusted_dfs, 'overall') - - folder_path = ( - f"{get_data_directory()}/targets/edition=cleaned/" - f"base_period={year}/reference_period={year}/" - f"variable=snap_households/" - ) - - clean_out = pd.concat([ - adjusted_dfs['National'][['GEO_ID', 'overall']], - adjusted_dfs['State'][['GEO_ID', 'overall']], - adjusted_dfs['DC'][['GEO_ID', 'overall']], - adjusted_dfs['District'][['GEO_ID', 'overall']] - ]).rename({"GEO_ID": "geography_id", "overall": "value"}, axis=1) - - clean_out.to_csv(os.path.join(folder_path, "part-001.csv"), index=False) - - -def reformat_cleaned_data(): - """Temporary conversion function""" - benefits_dir = Path(get_data_directory() / 'input' / 'benefits') - - snap_filepath = Path( - get_data_directory(), - "targets", - "edition=cleaned", - "base_period=2023", - "reference_period=2023", - "variable=snap_households", - "part-001.csv" + + return dfs + + +def load_survey_snap_data(survey_dfs, year, stratum_lookup ={}): + """Use an already defined stratum_lookup to load the survey SNAP data""" + + DATABASE_URL = "sqlite:///policy_data.db" + engine = create_engine(DATABASE_URL) + + Session = sessionmaker(bind=engine) + session = Session() + + # National. 
Use the stratum from the administrative data function + nat_df = survey_dfs["National"] + nat_stratum = session.get(Stratum, stratum_lookup["National"]) + + nat_stratum.targets_rel.append( + Target( + variable="household_count", + period=year, + value=nat_df["snap_household_ct"], + source_id=4, + active=True, + ) ) - snap_data = pd.read_csv(snap_filepath) - geo_hierarchies = pd.read_csv(Path(get_data_directory(), 'meta', 'geo_hierarchies.csv')) - - # Use Type II SCD to Filter geo_hierarchies for the year 2023 - geo_hierarchies['start_date'] = pd.to_datetime(geo_hierarchies['start_date']) - geo_hierarchies['end_date'] = pd.to_datetime(geo_hierarchies['end_date']) - geo_hierarchies_2023 = geo_hierarchies[ - (geo_hierarchies['start_date'] <= '2023-01-01') & - (geo_hierarchies['end_date'] >= '2023-01-01') - ] - - merged_data = pd.merge(snap_data, geo_hierarchies_2023, left_on='geography_id', right_on='geography_id') - - def create_cleaned_df(data, geo_name_map=None, geo_name_prefix=''): - df = pd.DataFrame() - df['GEO_ID'] = data['geography_id'] - if geo_name_map: - df['GEO_NAME'] = data['geography_id'].map(geo_name_map) - elif 'geography_name' in data.columns: - df['GEO_NAME'] = data['geography_name'] - else: - df['GEO_NAME'] = '' - - df['AGI_LOWER_BOUND'] = '' - df['AGI_UPPER_BOUND'] = '' - df['VALUE'] = data['value'] - df['IS_COUNT'] = 1 - df['VARIABLE'] = 'snap_households' - return df - - # National data - national_data = merged_data[merged_data['geography_type'] == 'nation'].copy() - national_data['geography_name'] = 'US' - cleaned_national = create_cleaned_df(national_data) - cleaned_national.to_csv(Path(get_data_directory(), 'input', 'benefits', 'cleaned_snap_national.csv'), index=False) - - # State data - state_data = merged_data[merged_data['geography_type'] == 'state-equivalent'].copy() - # TODO: fix this redundancy if this becomes permanenent - state_fips_map = { - '01': 'AL', '02': 'AK', '04': 'AZ', '05': 'AR', '06': 'CA', '08': 'CO', '09': 'CT', '10': 'DE', '11': 'DC', - '12': 'FL', '13': 'GA', '15': 'HI', '16': 'ID', '17': 'IL', '18': 'IN', '19': 'IA', '20': 'KS', '21': 'KY', - '22': 'LA', '23': 'ME', '24': 'MD', '25': 'MA', '26': 'MI', '27': 'MN', '28': 'MS', '29': 'MO', '30': 'MT', - '31': 'NE', '32': 'NV', '33': 'NH', '34': 'NJ', '35': 'NM', '36': 'NY', '37': 'NC', '38': 'ND', '39': 'OH', - '40': 'OK', '41': 'OR', '42': 'PA', '44': 'RI', '45': 'SC', '46': 'SD', '47': 'TN', '48': 'TX', '49': 'UT', - '50': 'VT', '51': 'VA', '53': 'WA', '54': 'WV', '55': 'WI', '56': 'WY' - } - state_data['state_fips'] = state_data['geography_id'].str[-2:] - state_data['geography_name'] = state_data['state_fips'].map(state_fips_map) - cleaned_state = create_cleaned_df(state_data) - cleaned_state.to_csv(Path('us-congressional-districts/data/input/benefits/cleaned_snap_state.csv', index=False) - cleaned_state.to_csv(Path(get_data_directory(), 'input', 'benefits', 'cleaned_snap_state.csv'), index=False) - - # District data - district_data = merged_data[merged_data['geography_type'] == 'district'].copy() - district_data['state_fips'] = district_data['geography_id'].str[9:11] - district_data['district_num'] = district_data['geography_id'].str[11:] - district_data['geography_name'] = district_data['state_fips'].map(state_fips_map) + ' - District ' + district_data['district_num'] - cleaned_district = create_cleaned_df(district_data) - cleaned_district["VALUE"] = cleaned_district["VALUE"].round().astype(int) - cleaned_district.to_csv(Path(get_data_directory(), 'input', 'benefits', 
'cleaned_snap_district.csv'), index=False) + session.add(nat_stratum) + session.flush() + # Skipping state for now, but + # # State. Also use the stratum from the administrative data function + # state_df = survey_dfs["State"] + # for _, row in state_df.iterrows(): + # print(row) + # state_stratum = session.get(Stratum, stratum_lookup["State"][row["ucgid_str"]]) + + # state_stratum.targets_rel.append( + # Target( + # variable="household_count", + # period=year, + # value=row["snap_household_ct"], + # source_id=4, + # active=True, + # ) + # ) + # session.add(state_stratum) + # session.flush() + + # You will need to create new strata for districts + district_df = survey_dfs["District"] + for _, row in district_df.iterrows(): + note = f"Geo: {row['ucgid_str']} Received SNAP Benefits" + state_ucgid_str = '0400000US' + row['ucgid_str'][9:11] + state_stratum_id = stratum_lookup['State'][state_ucgid_str] + new_stratum = Stratum( + parent_stratum_id=state_stratum_id, stratum_group_id=0, notes=note + ) + new_stratum.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="in", + value=row["ucgid_str"], + ), + StratumConstraint( + constraint_variable="snap", + operation="greater_than", + value='0', + ), + ] + new_stratum.targets_rel.append( + Target( + variable="household_count", + period=year, + value=row["snap_household_ct"], + source_id=4, + active=True, + ) + ) + session.add(new_stratum) + session.flush() -if __name__ == "__main__": - process_snap_data(2023) + session.commit() + return stratum_lookup -def main() -> None: +def main(): year = 2023 - zip_file = extract_snap_data(2023) + # Extract --------- + zip_file_admin = extract_administrative_snap_data() + raw_survey_dfs = extract_survey_snap_data(year) + + # Transform ------- + state_admin_df = transform_administrative_snap_data(zip_file_admin, year) + survey_dfs = transform_survey_snap_data(raw_survey_dfs) + + # Load ----------- + stratum_lookup = load_administrative_snap_data(state_admin_df, year) + load_survey_snap_data(survey_dfs, year, stratum_lookup) if __name__ == "__main__": diff --git a/policyengine_us_data/db/load_age_targets.py b/policyengine_us_data/db/load_age_targets.py index f42adcf3..f5142e17 100644 --- a/policyengine_us_data/db/load_age_targets.py +++ b/policyengine_us_data/db/load_age_targets.py @@ -17,61 +17,6 @@ logger = logging.getLogger(__name__) -STATE_NAME_TO_ABBREV = { - "Alabama": "AL", - "Alabama": "AL", - "Alaska": "AK", - "Arizona": "AZ", - "Arkansas": "AR", - "California": "CA", - "Colorado": "CO", - "Connecticut": "CT", - "Delaware": "DE", - "District of Columbia": "DC", - "Florida": "FL", - "Georgia": "GA", - "Hawaii": "HI", - "Idaho": "ID", - "Illinois": "IL", - "Indiana": "IN", - "Iowa": "IA", - "Kansas": "KS", - "Kentucky": "KY", - "Louisiana": "LA", - "Maine": "ME", - "Maryland": "MD", - "Massachusetts": "MA", - "Michigan": "MI", - "Minnesota": "MN", - "Mississippi": "MS", - "Missouri": "MO", - "Montana": "MT", - "Nebraska": "NE", - "Nevada": "NV", - "New Hampshire": "NH", - "New Jersey": "NJ", - "New Mexico": "NM", - "New York": "NY", - "North Carolina": "NC", - "North Dakota": "ND", - "Ohio": "OH", - "Oklahoma": "OK", - "Oregon": "OR", - "Pennsylvania": "PA", - "Rhode Island": "RI", - "South Carolina": "SC", - "South Dakota": "SD", - "Tennessee": "TN", - "Texas": "TX", - "Utah": "UT", - "Vermont": "VT", - "Virginia": "VA", - "Washington": "WA", - "West Virginia": "WV", - "Wisconsin": "WI", - "Wyoming": "WY", -} - LABEL_TO_SHORT = { "Estimate!!Total!!Total 
population!!AGE!!Under 5 years": "0-4", diff --git a/policyengine_us_data/utils/census.py b/policyengine_us_data/utils/census.py index a728b5ca..69a475fb 100644 --- a/policyengine_us_data/utils/census.py +++ b/policyengine_us_data/utils/census.py @@ -5,6 +5,61 @@ import numpy as np +STATE_NAME_TO_FIPS = { + "Alabama": "01", + "Alaska": "02", + "Arizona": "04", + "Arkansas": "05", + "California": "06", + "Colorado": "08", + "Connecticut": "09", + "Delaware": "10", + "District of Columbia": "11", + "Florida": "12", + "Georgia": "13", + "Hawaii": "15", + "Idaho": "16", + "Illinois": "17", + "Indiana": "18", + "Iowa": "19", + "Kansas": "20", + "Kentucky": "21", + "Louisiana": "22", + "Maine": "23", + "Maryland": "24", + "Massachusetts": "25", + "Michigan": "26", + "Minnesota": "27", + "Mississippi": "28", + "Missouri": "29", + "Montana": "30", + "Nebraska": "31", + "Nevada": "32", + "New Hampshire": "33", + "New Jersey": "34", + "New Mexico": "35", + "New York": "36", + "North Carolina": "37", + "North Dakota": "38", + "Ohio": "39", + "Oklahoma": "40", + "Oregon": "41", + "Pennsylvania": "42", + "Rhode Island": "44", + "South Carolina": "45", + "South Dakota": "46", + "Tennessee": "47", + "Texas": "48", + "Utah": "49", + "Vermont": "50", + "Virginia": "51", + "Washington": "53", + "West Virginia": "54", + "Wisconsin": "55", + "Wyoming": "56", +} + + def get_census_docs(year): docs_url = ( f"https://api.census.gov/data/{year}/acs/acs1/subject/variables.json" From e45072e9cd4dc5b30fcb81714bda359a753b05fc Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Tue, 12 Aug 2025 16:50:14 -0400 Subject: [PATCH 11/27] progress --- .../db/{load_age_targets.py => etl_age.py} | 4 - ...tl_irs_soi_targets.py => etl_eitc_only.py} | 0 policyengine_us_data/db/etl_irs_soi.py | 419 +++++++++++ policyengine_us_data/db/load_soi_targets.py | 672 ------------------ 4 files changed, 419 insertions(+), 676 deletions(-) rename policyengine_us_data/db/{load_age_targets.py => etl_age.py} (99%) rename policyengine_us_data/db/{etl_irs_soi_targets.py => etl_eitc_only.py} (100%) create mode 100644 policyengine_us_data/db/etl_irs_soi.py delete mode 100644 policyengine_us_data/db/load_soi_targets.py diff --git a/policyengine_us_data/db/load_age_targets.py b/policyengine_us_data/db/etl_age.py similarity index 99% rename from policyengine_us_data/db/load_age_targets.py rename to policyengine_us_data/db/etl_age.py index f5142e17..e168317b 100644 --- a/policyengine_us_data/db/load_age_targets.py +++ b/policyengine_us_data/db/etl_age.py @@ -1,4 +1,3 @@ -import logging import requests from pathlib import Path import io @@ -15,9 +14,6 @@ ) -logger = logging.getLogger(__name__) - - LABEL_TO_SHORT = { "Estimate!!Total!!Total population!!AGE!!Under 5 years": "0-4", "Estimate!!Total!!Total population!!AGE!!5 to 9 years": "5-9", diff --git a/policyengine_us_data/db/etl_irs_soi_targets.py b/policyengine_us_data/db/etl_eitc_only.py similarity index 100% rename from policyengine_us_data/db/etl_irs_soi_targets.py rename to policyengine_us_data/db/etl_eitc_only.py diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py new file mode 100644 index 00000000..d9eeb503 --- /dev/null +++ b/policyengine_us_data/db/etl_irs_soi.py @@ -0,0 +1,419 @@ +from pathlib import Path +from typing import List, Optional, Sequence, Dict, Tuple, Any, Union + +import numpy as np +import pandas as pd + +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker + +from 
policyengine_us_data.db.create_database_tables import ( + Stratum, + StratumConstraint, + Target, +) + + + + +"""Utilities to pull AGI targets from the IRS SOI data files.""" + +# Congressional districts have one fewer level than the national and state +# They're missing the million plus category +# ("No AGI Stub") is a specific, intentional category used by the IRS in its summary data files. +# +#SOI_COLUMNS = [ +# "Under $1", +# "$1 under $10,000", +# "$10,000 under $25,000", +# "$25,000 under $50,000", +# "$50,000 under $75,000", +# "$75,000 under $100,000", +# "$100,000 under $200,000", +# "$200,000 under $500,000", +# "$500,000 or more", +#] +# +#AGI_STUB_TO_BAND = {i + 1: band for i, band in enumerate(SOI_COLUMNS)} +# +#AGI_BOUNDS = { +# "Under $1": (-np.inf, 1), +# "$1 under $10,000": (1, 10_000), +# "$10,000 under $25,000": (10_000, 25_000), +# "$25,000 under $50,000": (25_000, 50_000), +# "$50,000 under $75,000": (50_000, 75_000), +# "$75,000 under $100,000": (75_000, 100_000), +# "$100,000 under $200,000": (100_000, 200_000), +# "$200,000 under $500,000": (200_000, 500_000), +# "$500,000 or more": (500_000, np.inf), +#} +# +##NON_VOTING_STATES = {"US", "AS", "GU", "MP", "PR", "VI", "OA"} +# +IGNORE_GEO_IDS = { + "0400000US72", # Puerto Rico (state level) + "5001800US7298", # Puerto Rico + "5001800US6098", # American Samoa + "5001800US6698", # Guam + "5001800US6998", # Northern Mariana Islands + "5001800US7898", # U.S. Virgin Islands +} + + +def create_records(df, breakdown_variable, target_variable): + """Transforms a DataFrame subset into a standardized list of records.""" + temp_df = df[["ucgid_str"]].copy() + temp_df["breakdown_variable"] = breakdown_variable + temp_df["breakdown_value"] = df[breakdown_variable] + temp_df["target_variable"] = target_variable + temp_df["target_value"] = df[target_variable] + return temp_df + + +def make_records( + df: pd.DataFrame, + *, + count_col: str, + amount_col: str, + amount_name: str, + breakdown_col: Optional[str] = None, + multiplier: int = 1_000, +): + df = ( + df.rename({count_col: "tax_unit_count", + amount_col: amount_name}, + axis=1) + .copy() + ) + + if breakdown_col is None: + breakdown_col = "one" + df[breakdown_col] = 1 + + rec_counts = create_records(df, breakdown_col, "tax_unit_count") + rec_amounts = create_records(df, breakdown_col, amount_name) + rec_amounts["target_value"] *= multiplier # Only the amounts get * 1000 + rec_counts["target_variable"] = f"{amount_name}_tax_unit_count" + + return rec_counts, rec_amounts + + +def make_agi_long(df: pd.DataFrame) -> pd.DataFrame: + """Convert IRS SOI AGI‑split table from wide to the long format used""" + target_col_map = { + "N1": "agi_tax_unit_count", + "N2": "agi_person_count", + "A00100": "agi_total_amount", + } + work = ( + df[["ucgid_str", "agi_stub"] + list(target_col_map)] + .rename(columns=target_col_map) + ) + long = ( + work.melt( + id_vars=["ucgid_str", "agi_stub"], + var_name="target_variable", + value_name="target_value" + ) + .rename(columns={"agi_stub": "breakdown_value"}) + .assign(breakdown_variable="agi_stub") + ) + long = long[["ucgid_str", + "breakdown_variable", + "breakdown_value", + "target_variable", + "target_value"]] + return ( + long.sort_values(["ucgid_str", "breakdown_value", "target_variable"]) + .reset_index(drop=True) + ) + + +def extract_soi_data() -> pd.DataFrame: + """Download and save congressional district AGI totals. 
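+    (A year-parameterized URL, assuming the IRS keeps this naming scheme,
+    would be f"https://www.irs.gov/pub/irs-soi/{str(year)[-2:]}incd.csv".)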
+ + In the file below, "22" is 2022, "in" is individual returns, + "cd" is congressional districts + """ + return pd.read_csv("https://www.irs.gov/pub/irs-soi/22incd.csv") + + +def transform_soi_data(raw_df): + + TARGETS = [ + dict(code="59661", name="eitc", breakdown=("eitc_children", 0)), + dict(code="59662", name="eitc", breakdown=("eitc_children", 1)), + dict(code="59663", name="eitc", breakdown=("eitc_children", 2)), + dict(code="59664", name="eitc", breakdown=("eitc_children", "3+")), + dict(code="59664", name="qbid", breakdown=None), + dict(code="18500", name="real_estate_taxes", breakdown=None), + dict(code="01000", name="net_capital_gain", breakdown=None), + dict(code="03150", name="ira_payments", breakdown=None), + dict(code="00300", name="taxable_interest", breakdown=None), + dict(code="00400", name="tax_exempt_interest", breakdown=None), + dict(code="00600", name="oridinary_dividends", breakdown=None), + dict(code="00650", name="qualified_dividends", breakdown=None), + dict(code="26270", name="partnership_and_s_crop_net_income", breakdown=None), + dict(code="02500", name="total_social_security", breakdown=None), + dict(code="01700", name="pension_and_annuities", breakdown=None), + dict(code="02300", name="unemployment_compensation", breakdown=None), + dict(code="00900", name="business_net_income", breakdown=None), + dict(code="17000", name="medical_and_dental_deduction", breakdown=None), + dict(code="00700", name="salt_refunds", breakdown=None), + dict(code="18425", name="salt_amount", breakdown=None), + dict(code="06500", name="income_tax", breakdown=None), + ] + + # National --------------- + national_df = raw_df.copy().loc[ + (raw_df.STATE == "US") + ] + national_df["ucgid_str"] = "0100000US" + + # State ------------------- + # You've got agi_stub == 0 in here, which you want to use any time you don't want to + # break things up by AGI + state_df = raw_df.copy().loc[ + (raw_df.STATE != "US") & + (raw_df.CONG_DISTRICT == 0) + ] + state_df["ucgid_str"] = "0400000US" + state_df["STATEFIPS"].astype(str).str.zfill(2) + + # District ------------------ + # This is going to fail because we're missing the single cong district states + district_df = raw_df.copy().loc[ + (raw_df.CONG_DISTRICT > 0) + ] + + max_cong_district_by_state = raw_df.groupby('STATE')['CONG_DISTRICT'].transform('max') + district_df = raw_df.copy().loc[ + (raw_df['CONG_DISTRICT'] > 0) | (max_cong_district_by_state == 0) + ] + district_df = district_df.loc[district_df['STATE'] != 'US'] + district_df["STATEFIPS"] = district_df["STATEFIPS"].astype(int).astype(str).str.zfill(2) + district_df["CONG_DISTRICT"] = ( + district_df["CONG_DISTRICT"].astype(int).astype(str).str.zfill(2) + ) + district_df["ucgid_str"] = "5001800US" + district_df["STATEFIPS"] + district_df["CONG_DISTRICT"] + district_df = district_df[~district_df["ucgid_str"].isin(IGNORE_GEO_IDS)] + + assert district_df.shape[0] % 436 == 0 + + all_df = pd.concat([national_df, state_df, district_df]) + + # "Marginal" over AGI bands, which this data set is organized according to + all_marginals = all_df.copy().loc[all_df.agi_stub == 0] + assert all_marginals.shape[0] == 436 + 51 + 1 + + # Collect targets from the SOI file + records = [] + for spec in TARGETS: + count_col = f"N{spec['code']}" # e.g. 'N59661' + amount_col = f"A{spec['code']}" # e.g. 
'A59661' + + df = all_marginals.copy() + + if spec["breakdown"] is not None: + col, val = spec["breakdown"] + df[col] = val + breakdown_col = col + else: + breakdown_col = None + + rec_counts, rec_amounts = make_records( + df, + count_col = count_col, + amount_col = amount_col, + amount_name = spec["name"], + breakdown_col = breakdown_col, + multiplier = 1_000, + ) + records.extend([rec_counts, rec_amounts]) + + + # AGI Processing (separate, doesn't have a count column) + temp_df = df[["ucgid_str"]].copy() + temp_df["breakdown_variable"] = "one" + temp_df["breakdown_value"] = 1 + temp_df["target_variable"] = "agi" + temp_df["target_value"] = df["A00100"] * 1_000 + + records.append(temp_df) + + # Note: national counts only have agi_stub = 0 + all_agi_splits = all_df.copy().loc[all_df.agi_stub != 0] + assert all_agi_splits.shape[0] % (436 + 51 + 0) == 0 + + agi_long = make_agi_long(all_agi_splits) + agi_long = agi_long.loc[agi_long.target_variable != "agi_total_amount"] + + records.append(agi_long) + + return pd.concat(records) + + +def load_soi_data(long_dfs, year): + + DATABASE_URL = "sqlite:///policy_data.db" + engine = create_engine(DATABASE_URL) + + Session = sessionmaker(bind=engine) + session = Session() + + # Load EITC data -------------------------------------------------------- + # NOTE: obviously this is not especially robust --- + eitc_data = {'0': (long_dfs[0], long_dfs[1]), + '1': (long_dfs[2], long_dfs[3]), + '2': (long_dfs[4], long_dfs[5]), + '3+': (long_dfs[6], long_dfs[7])} + + stratum_lookup = {"State": {}, "District": {}} + for n_children in eitc_data.keys(): + eitc_count_i, eitc_amount_i = eitc_data[n_children] + for i in range(eitc_count_i.shape[0]): + ucgid_i = eitc_count_i[['ucgid_str']].iloc[i].values[0] + note = f"Geo: {ucgid_i}, EITC received with {n_children} children" + + if len(ucgid_i) == 9: # National. + new_stratum = Stratum( + parent_stratum_id=None, stratum_group_id=0, notes=note + ) + elif len(ucgid_i) == 11: # State + new_stratum = Stratum( + parent_stratum_id=stratum_lookup["National"], + stratum_group_id=0, + notes=note + ) + elif len(ucgid_i) == 13: # District + new_stratum = Stratum( + parent_stratum_id=stratum_lookup["State"]['0400000US' + ucgid_i[9:11]], + stratum_group_id=0, + notes=note + ) + + new_stratum.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="in", + value=ucgid_i, + ), + ] + if n_children == "3+": + new_stratum.constraints_rel.append( + StratumConstraint( + constraint_variable="eitc_children", + operation="greater_than_or_equal_to", + value='3', + ) + ) + else: + new_stratum.constraints_rel.append( + StratumConstraint( + constraint_variable="eitc_children", + operation="equals", + value=f'{n_children}', + ) + ) + + new_stratum.targets_rel = [ + Target( + variable="tax_unit_count", + period=year, + value=eitc_count_i.iloc[i][["target_value"]].values[0], + source_id=5, + active=True, + ), + Target( + variable="eitc", + period=year, + value=eitc_amount_i.iloc[i][["target_value"]].values[0], + source_id=5, + active=True, + ) + ] + + session.add(new_stratum) + session.flush() + + if len(ucgid_i) == 9: + stratum_lookup["National"] = new_stratum.stratum_id + elif len(ucgid_i) == 11: + stratum_lookup["State"][ucgid_i] = new_stratum.stratum_id + + + # No breakdown variables in this set + for j in range(8, 42, 2): + print(long_dfs[j]) # count + print(long_dfs[j + 1]) # amount + + # Why are we making strata here? 
You have a lot of these to run through + count_j, amount_j = long_dfs[j], long_dfs[j + 1] + for i in range(count_j.shape[0]): + ucgid_i = count_j[['ucgid_str']].iloc[i].values[0] + # If there's no breakdown variable, is this a new geo? + # The problem is, it's vary difficult to search for a geography + # That's already in existance + note = f"Geo: {ucgid_i}" + + if len(ucgid_i) == 9: # National. + new_stratum = Stratum( + parent_stratum_id=None, stratum_group_id=0, notes=note + ) + elif len(ucgid_i) == 11: # State + new_stratum = Stratum( + parent_stratum_id=stratum_lookup["National"], + stratum_group_id=0, + notes=note + ) + elif len(ucgid_i) == 13: # District + new_stratum = Stratum( + parent_stratum_id=stratum_lookup["State"]['0400000US' + ucgid_i[9:11]], + stratum_group_id=0, + notes=note + ) + + new_stratum.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="in", + value=ucgid_i, + ), + ] + new_stratum.targets_rel = [ + Target( + variable="tax_unit_count", + period=year, + value=count_j.iloc[i][["target_value"]].values[0], + source_id=5, + active=True, + ), + Target( + variable=amount_j.iloc[0][["target_variable"]].values[0], + period=year, + value=amount_j.iloc[i][["target_value"]].values[0], + source_id=5, + active=True, + ) + ] + + session.add(new_stratum) + session.flush() + + if len(ucgid_i) == 9: + stratum_lookup["National"] = new_stratum.stratum_id + elif len(ucgid_i) == 11: + stratum_lookup["State"][ucgid_i] = new_stratum.stratum_id + + session.commit() + + + +def main() -> None: + year = 2022 # NOTE: predates the finalization of the 2020 Census redistricting + raw_df = extract_soi_data() + + long_dfs = transform_soi_data(raw_df): + + +if __name__ == "__main__": + main() diff --git a/policyengine_us_data/db/load_soi_targets.py b/policyengine_us_data/db/load_soi_targets.py deleted file mode 100644 index 2fe3fa91..00000000 --- a/policyengine_us_data/db/load_soi_targets.py +++ /dev/null @@ -1,672 +0,0 @@ -# This is the file where we actually get the SOI information that we want: - -# Goal: start with raw AGI and EITC: -# Data Dictionary: https://www.irs.gov/pub/irs-soi/22incddocguide.docx -# The Data: https://www.irs.gov/pub/irs-soi/22incd.csv - -from pathlib import Path -from typing import List, Optional, Sequence, Dict, Tuple, Any, Union - -import numpy as np -import pandas as pd -import logging - -from policyengine_us_data.storage import CALIBRATION_FOLDER - -logger = logging.getLogger(__name__) - -"""Utilities to pull AGI targets from the IRS SOI data files.""" - -# Congressional districts have one fewer level than the national and state -# They're missing the million plus category -# ("No AGI Stub") is a specific, intentional category used by the IRS in its summary data files. 
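
The band mismatch described in the comments above works out as follows: the national and state series report ten AGI stubs while districts report nine, so the million-plus stub and the one below it are collapsed into a single "$500,000 or more" band. A minimal sketch of that collapse, with made-up counts (not SOI values):

import numpy as np

# Ten hypothetical return counts, one per national AGI stub 1-10:
national_counts = np.array([5, 10, 20, 30, 25, 18, 40, 15, 6, 2])

# Sum the top two stubs into one "$500,000 or more" band so the vector
# lines up with the nine district-level bands:
district_bands = np.concatenate(
    [national_counts[:8], [national_counts[8] + national_counts[9]]]
)
assert district_bands.shape == (9,)
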
-SOI_COLUMNS = [ - "Under $1", - "$1 under $10,000", - "$10,000 under $25,000", - "$25,000 under $50,000", - "$50,000 under $75,000", - "$75,000 under $100,000", - "$100,000 under $200,000", - "$200,000 under $500,000", - "$500,000 or more", -] - -AGI_STUB_TO_BAND = {i + 1: band for i, band in enumerate(SOI_COLUMNS)} - -AGI_BOUNDS = { - "Under $1": (-np.inf, 1), - "$1 under $10,000": (1, 10_000), - "$10,000 under $25,000": (10_000, 25_000), - "$25,000 under $50,000": (25_000, 50_000), - "$50,000 under $75,000": (50_000, 75_000), - "$75,000 under $100,000": (75_000, 100_000), - "$100,000 under $200,000": (100_000, 200_000), - "$200,000 under $500,000": (200_000, 500_000), - "$500,000 or more": (500_000, np.inf), -} - -#NON_VOTING_STATES = {"US", "AS", "GU", "MP", "PR", "VI", "OA"} - -IGNORE_GEO_IDS = { - "0400000US72", # Puerto Rico (state level) - "5001800US7298", # Puerto Rico - "5001800US6098", # American Samoa - "5001800US6698", # Guam - "5001800US6998", # Northern Mariana Islands - "5001800US7898", # U.S. Virgin Islands -} - -# after skipping the first 7 rows, the national SOI file has targets as row indices [COUNT_INDEX, AMOUNT_INDEX] -NATIONAL_VARIABLES = { - "adjusted_gross_income": [0, 17], -} - -# the state and district SOI file have targets as column names [COUNT_COL_NAME, AMOUNT_COL_NAME] -GEOGRAPHY_VARIABLES = {"adjusted_gross_income": ["N1", "A00100"]} - -STATE_ABBR_TO_FIPS = { - "AL": "01", - "AK": "02", - "AZ": "04", - "AR": "05", - "CA": "06", - "CO": "08", - "CT": "09", - "DC": "11", - "DE": "10", - "FL": "12", - "GA": "13", - "HI": "15", - "ID": "16", - "IL": "17", - "IN": "18", - "IA": "19", - "KS": "20", - "KY": "21", - "LA": "22", - "ME": "23", - "MD": "24", - "MA": "25", - "MI": "26", - "MN": "27", - "MS": "28", - "MO": "29", - "MT": "30", - "NE": "31", - "NV": "32", - "NH": "33", - "NJ": "34", - "NM": "35", - "NY": "36", - "NC": "37", - "ND": "38", - "OH": "39", - "OK": "40", - "OR": "41", - "PA": "42", - "RI": "44", - "SC": "45", - "SD": "46", - "TN": "47", - "TX": "48", - "UT": "49", - "VT": "50", - "VA": "51", - "WA": "53", - "WV": "54", - "WI": "55", - "WY": "56", -} -FIPS_TO_STATE_ABBR = {v: k for k, v in STATE_ABBR_TO_FIPS.items()} - - -#def pull_national_soi_variable( -# soi_variable_ident: int, # the national SOI xlsx file has a row for each target variable -# variable_name: Union[str, None], -# is_count: bool, -# national_df: Optional[pd.DataFrame] = None, -#) -> pd.DataFrame: -# """Download and save national AGI totals.""" -# df = pd.read_excel( -# "https://www.irs.gov/pub/irs-soi/22in54us.xlsx", skiprows=7 -# ) -# -# assert ( -# np.abs( -# df.iloc[soi_variable_ident, 1] -# - df.iloc[soi_variable_ident, 2:12].sum() -# ) -# < 100 -# ), "Row 0 doesn't add up — check the file." 
-# -# agi_values = df.iloc[soi_variable_ident, 2:12].astype(int).to_numpy() -# agi_values = np.concatenate( -# [agi_values[:8], [agi_values[8] + agi_values[9]]] -# ) -# -# agi_brackets = [ -# AGI_STUB_TO_BAND[i] for i in range(1, len(SOI_COLUMNS) + 1) -# ] -# -# result = pd.DataFrame( -# { -# "GEO_ID": ["0100000US"] * len(agi_brackets), -# "GEO_NAME": ["national"] * len(agi_brackets), -# "LOWER_BOUND": [AGI_BOUNDS[b][0] for b in agi_brackets], -# "UPPER_BOUND": [AGI_BOUNDS[b][1] for b in agi_brackets], -# "VALUE": agi_values, -# } -# ) -# -# # final column order -# result = result[ -# ["GEO_ID", "GEO_NAME", "LOWER_BOUND", "UPPER_BOUND", "VALUE"] -# ] -# result["IS_COUNT"] = int(is_count) -# result["VARIABLE"] = variable_name -# -# result["VALUE"] = np.where( -# result["IS_COUNT"] == 0, result["VALUE"] * 1_000, result["VALUE"] -# ) -# -# if national_df is not None: -# # If a DataFrame is passed, we append the new data to it. -# df = pd.concat([national_df, result], ignore_index=True) -# return df -# -# return result -# -# -#def pull_state_soi_variable( -# soi_variable_ident: str, # the state SOI csv file has a column for each target variable -# variable_name: Union[str, None], -# is_count: bool, -# state_df: Optional[pd.DataFrame] = None, -#) -> pd.DataFrame: -# """Download and save state AGI totals.""" -# df = pd.read_csv( -# "https://www.irs.gov/pub/irs-soi/22in55cmcsv.csv", thousands="," -# ) -# -# merged = ( -# df[df["AGI_STUB"].isin([9, 10])] -# .groupby("STATE", as_index=False) -# .agg({soi_variable_ident: "sum"}) -# .assign(AGI_STUB=9) -# ) -# df = df[~df["AGI_STUB"].isin([9, 10])] -# df = pd.concat([df, merged], ignore_index=True) -# df = df[df["AGI_STUB"] != 0] -# -# df["agi_bracket"] = df["AGI_STUB"].map(AGI_STUB_TO_BAND) -# -# df["state_abbr"] = df["STATE"] -# df["GEO_ID"] = "0400000US" + df["state_abbr"].map(STATE_ABBR_TO_FIPS) -# df["GEO_NAME"] = "state_" + df["state_abbr"] -# -# result = df.loc[ -# ~df["STATE"].isin(NON_VOTING_STATES.union({"US"})), -# ["GEO_ID", "GEO_NAME", "agi_bracket", soi_variable_ident], -# ].rename(columns={soi_variable_ident: "VALUE"}) -# -# result["LOWER_BOUND"] = result["agi_bracket"].map( -# lambda b: AGI_BOUNDS[b][0] -# ) -# result["UPPER_BOUND"] = result["agi_bracket"].map( -# lambda b: AGI_BOUNDS[b][1] -# ) -# -# # final column order -# result = result[ -# ["GEO_ID", "GEO_NAME", "LOWER_BOUND", "UPPER_BOUND", "VALUE"] -# ] -# result["IS_COUNT"] = int(is_count) -# result["VARIABLE"] = variable_name -# -# result["VALUE"] = np.where( -# result["IS_COUNT"] == 0, result["VALUE"] * 1_000, result["VALUE"] -# ) -# -# if state_df is not None: -# # If a DataFrame is passed, we append the new data to it. 
-# df = pd.concat([state_df, result], ignore_index=True) -# return df -# -# return result - -def create_records(df, breakdown_variable, target_variable): - """Transforms a DataFrame subset into a standardized list of records.""" - temp_df = df[["ucgid_str"]].copy() - temp_df["breakdown_variable"] = breakdown_variable - temp_df["breakdown_value"] = df[breakdown_variable] - temp_df["target_variable"] = target_variable - temp_df["target_value"] = df[target_variable] - return temp_df - - -def make_records( - df: pd.DataFrame, - *, - count_col: str, - amount_col: str, - amount_name: str, - breakdown_col: Optional[str] = None, - multiplier: int = 1_000, -): - df = ( - df.rename({count_col: "tax_unit_count", - amount_col: amount_name}, - axis=1) - .copy() - ) - - if breakdown_col is None: - breakdown_col = "one" - df[breakdown_col] = 1 - - rec_counts = create_records(df, breakdown_col, "tax_unit_count") - rec_amounts = create_records(df, breakdown_col, amount_name) - rec_amounts["target_value"] *= multiplier # Only the amounts get * 1000 - rec_counts["target_variable"] = f"{amount_name}_tax_unit_count" - - return rec_counts, rec_amounts - - - -_TARGET_COL_MAP = { - "N1": "agi_tax_unit_count", # number of returns (≈ “tax units”) - "N2": "agi_person_count", # number of individuals - "A00100": "agi_total_amount", # total Adjusted Gross Income -} - -_BREAKDOWN_FIELD = "agi_stub" # numeric AGI stub 1‑10 from IRS -_BREAKDOWN_NAME = "agi_stub" # what will go in `breakdown_variable` - -def make_agi_long(df: pd.DataFrame) -> pd.DataFrame: - """ - Convert IRS SOI AGI‑split table from wide to the long format used - in your `records[*]` list. - - Parameters - ---------- - df : DataFrame - Must contain `ucgid_str`, `agi_stub` and the three IRS fields - in `_TARGET_COL_MAP` (N1, N2, A00100). - - Returns - ------- - DataFrame with columns: - ucgid_str - breakdown_variable (always "agi_stub") - breakdown_value (1‑10) - target_variable ("agi_tax_unit_count" | "agi_person_count" | "agi_total_amount") - target_value (float) - """ - # — keep only what we need and rename for clarity - work = ( - df[["ucgid_str", _BREAKDOWN_FIELD] + list(_TARGET_COL_MAP)] - .rename(columns=_TARGET_COL_MAP) # N1 → agi_tax_unit_count, etc. - ) - - # — wide → long - long = ( - work.melt( - id_vars=["ucgid_str", _BREAKDOWN_FIELD], - var_name="target_variable", - value_name="target_value" - ) - .rename(columns={_BREAKDOWN_FIELD: "breakdown_value"}) - .assign(breakdown_variable=_BREAKDOWN_NAME) - # Optional: add a human‑readable band label if useful - # .assign(breakdown_label=lambda d: d["breakdown_value"].map(AGI_STUB_TO_BAND)) - ) - - # — final column order - long = long[["ucgid_str", - "breakdown_variable", - "breakdown_value", - "target_variable", - "target_value"]] - - # consistently sort (purely cosmetic) - return ( - long.sort_values(["ucgid_str", "breakdown_value", "target_variable"]) - .reset_index(drop=True) - ) - - -def extract_soi_data() -> pd.DataFrame: - """Download and save congressional district AGI totals. - - In the file below, "22" is 2022, "in" is individual returns, - "cd" is congressional districts - """ - return pd.read_csv("https://www.irs.gov/pub/irs-soi/22incd.csv") - - -raw_df = extract_soi_data() -# a "stub" is a term the IRS uses for a predefined category or group, specifically an income bracket. 
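
One orientation note before the TARGETS table below: each SOI code pairs a count column (prefix "N", number of returns) with an amount column (prefix "A", dollars reported in thousands). A minimal sketch of the naming convention; the code value is taken from the TARGETS table, and the variable names here are illustrative:

# The same f-string pattern appears in transform_soi_data
# (f"N{spec['code']}" / f"A{spec['code']}"):
code = "59661"           # EITC, returns with zero qualifying children
count_col = f"N{code}"   # number of returns -> "N59661"
amount_col = f"A{code}"  # amount in $1,000s -> "A59661"
assert (count_col, amount_col) == ("N59661", "A59661")
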
- -TARGETS = [ - dict(code="59661", name="eitc", breakdown=("eitc_children", 0)), - dict(code="59662", name="eitc", breakdown=("eitc_children", 1)), - dict(code="59663", name="eitc", breakdown=("eitc_children", 2)), - dict(code="59664", name="eitc", breakdown=("eitc_children", "3+")), - dict(code="59664", name="qbid", breakdown=None), - dict(code="18500", name="real_estate_taxes", breakdown=None), - dict(code="01000", name="net_capital_gain", breakdown=None), - dict(code="03150", name="ira_payments", breakdown=None), - dict(code="00300", name="taxable_interest", breakdown=None), - dict(code="00400", name="tax_exempt_interest", breakdown=None), - dict(code="00600", name="oridinary_dividends", breakdown=None), - dict(code="00650", name="qualified_dividends", breakdown=None), - dict(code="26270", name="partnership_and_s_crop_net_income", breakdown=None), - dict(code="02500", name="total_social_security", breakdown=None), - dict(code="01700", name="pension_and_annuities", breakdown=None), - dict(code="02300", name="unemployment_compensation", breakdown=None), - dict(code="00900", name="business_net_income", breakdown=None), - dict(code="17000", name="medical_and_dental_deduction", breakdown=None), - dict(code="00700", name="salt_refunds", breakdown=None), - dict(code="18425", name="salt_amount", breakdown=None), - dict(code="06500", name="income_tax", breakdown=None), -] - - - -def transform_soi_data(raw_df) - - - # agi_stub is only 0, so there are only agi breakdowns at the state level - # So you can confirm summability for 0 and then forget that national exists - # Honestly I think that's a better idea in general. If your states don't add - # Up to your national, something's off and you should treat it as an immediate - # problem to fix rather than something to be adjusted - national_df = raw_df.copy().loc[ - (raw_df.STATE == "US") - ] - national_df["ucgid_str"] = "0100000US" - - # You've got agi_stub == 0 in here, which you want to use any time you don't want to - # break things up by AGI - state_df = raw_df.copy().loc[ - (raw_df.STATE != "US") & - (raw_df.CONG_DISTRICT == 0) - ] - state_df["ucgid_str"] = "0400000US" + state_df["STATEFIPS"].astype(str).str.zfill(2) - - # This is going to fail because we're missing the single cong district states - district_df = raw_df.copy().loc[ - (raw_df.CONG_DISTRICT > 0) - ] - - max_cong_district_by_state = raw_df.groupby('STATE')['CONG_DISTRICT'].transform('max') - district_df = raw_df.copy().loc[ - (raw_df['CONG_DISTRICT'] > 0) | (max_cong_district_by_state == 0) - ] - district_df = district_df.loc[district_df['STATE'] != 'US'] - district_df["STATEFIPS"] = district_df["STATEFIPS"].astype(int).astype(str).str.zfill(2) - district_df["CONG_DISTRICT"] = ( - district_df["CONG_DISTRICT"].astype(int).astype(str).str.zfill(2) - ) - district_df["ucgid_str"] = "5001800US" + district_df["STATEFIPS"] + district_df["CONG_DISTRICT"] - district_df = district_df[~district_df["ucgid_str"].isin(IGNORE_GEO_IDS)] - - assert district_df.shape[0] % 436 == 0 - - # And you've got everything you need for all 3 levels of targets: - # 1. national_df - # 2. state_df - # 3. district_df - - all_df = pd.concat([national_df, state_df, district_df]) - - # So I want to get 2 variable categories out of this thing, in long format - # 1) EITC, and 2) AGI - # There's eitc_child_count, eitc. There's person_count and tax_unit_count - # but no household_count. 
That's why you're doing this though, for a great example - # Wide (a new variable per number of children) or Long (breakdown variable is number of children) - - # Marginal in terms of AGI, which this data set is organized with respect to - all_marginals = all_df.copy().loc[all_df.agi_stub == 0] - assert all_marginals.shape[0] == 436 + 51 + 1 - - # Collect targets from the SOI file - records = [] - for spec in TARGETS: - count_col = f"N{spec['code']}" # e.g. 'N59661' - amount_col = f"A{spec['code']}" # e.g. 'A59661' - - df = all_marginals.copy() - - if spec["breakdown"] is not None: - col, val = spec["breakdown"] - df[col] = val - breakdown_col = col - else: - breakdown_col = None - - rec_counts, rec_amounts = make_records( - df, - count_col = count_col, - amount_col = amount_col, - amount_name = spec["name"], - breakdown_col = breakdown_col, - multiplier = 1_000, - ) - records.extend([rec_counts, rec_amounts]) - - - # Custom AGI amount, which doesn't have a count column (it has N1 and N2) - temp_df = df[["ucgid_str"]].copy() - temp_df["breakdown_variable"] = "one" - temp_df["breakdown_value"] = 1 - temp_df["target_variable"] = "agi" - temp_df["target_value"] = df["A00100"] * 1_000 - - records.append(temp_df) - - # It's notable that the national counts only have agi_stub = 0 - all_agi_splits = all_df.copy().loc[all_df.agi_stub != 0] - assert all_agi_splits.shape[0] % (436 + 51 + 0) == 0 - - # Still a bit of work to do at the time of loading, since the breakdown variable - # is agi_stub - agi_long = make_agi_long(all_agi_splits) - - # We have the distribution and the total amount, let's not go crazy here - agi_long = agi_long.loc[agi_long.target_variable != "agi_total_amount"] - - records.append(agi_long) - - return pd.concat(records) - - -def _get_soi_data(geo_level: str) -> pd.DataFrame: - """ - geo_level ∈ {'National', 'State', 'District'} - Returns a DataFrame with all SOI variables for the specified geography level - """ - if geo_level == "National": - var_indices = NATIONAL_VARIABLES - variable_pull = pull_national_soi_variable - elif geo_level == "State": - var_indices = GEOGRAPHY_VARIABLES - variable_pull = pull_state_soi_variable - elif geo_level == "District": - var_indices = GEOGRAPHY_VARIABLES - variable_pull = pull_district_soi_variable - else: - raise ValueError("geo_level must be National, State or District") - - df = pd.DataFrame() - for variable, identifiers in var_indices.items(): - count_id, amount_id = identifiers - # Pull count data (first identifier) - count_df = variable_pull( - soi_variable_ident=count_id, - variable_name=variable, - is_count=float(True), - ) - df = pd.concat([df, count_df], ignore_index=True) - # Pull amount data (second identifier) - amount_df = variable_pull( - soi_variable_ident=amount_id, - variable_name=variable, - is_count=float(False), - ) - df = pd.concat([df, amount_df], ignore_index=True) - - return df - - -def combine_geography_levels(districts: Optional[bool] = False) -> None: - """Combine SOI data across geography levels with validation and rescaling.""" - national = _get_soi_data("National") - state = _get_soi_data("State") - if districts: - district = _get_soi_data("District") - - # Add state FIPS codes for validation - state["STATEFIPS"] = state["GEO_ID"].str[-2:] - if districts: - district["STATEFIPS"] = district["GEO_ID"].str[-4:-2] - - # Get unique variables and AGI brackets for iteration - variables = national["VARIABLE"].unique() - agi_brackets = national[["LOWER_BOUND", "UPPER_BOUND"]].drop_duplicates() - - # Validate and 
rescale state totals against national totals - for variable in variables: - for is_count in [0.0, 1.0]: # Process count and amount separately - for _, bracket in agi_brackets.iterrows(): - lower, upper = ( - bracket["LOWER_BOUND"], - bracket["UPPER_BOUND"], - ) - - # Get national total for this variable/bracket/type combination - nat_mask = ( - (national["VARIABLE"] == variable) - & (national["LOWER_BOUND"] == lower) - & (national["UPPER_BOUND"] == upper) - & (national["IS_COUNT"] == is_count) - ) - us_total = national.loc[nat_mask, "VALUE"].iloc[0] - - # Get state total for this variable/bracket/type combination - state_mask = ( - (state["VARIABLE"] == variable) - & (state["LOWER_BOUND"] == lower) - & (state["UPPER_BOUND"] == upper) - & (state["IS_COUNT"] == is_count) - ) - state_total = state.loc[state_mask, "VALUE"].sum() - - # Rescale states if they don't match national total - if not np.isclose(state_total, us_total, rtol=1e-3): - count_type = "count" if is_count == 1.0 else "amount" - logger.warning( - f"States' sum does not match national total for {variable}/{count_type} " - f"in bracket [{lower}, {upper}]. Rescaling state targets." - ) - state.loc[state_mask, "VALUE"] *= us_total / state_total - - if districts: - # Validate and rescale district totals against state totals - for variable in variables: - for is_count in [0.0, 1.0]: # Process count and amount separately - for _, bracket in agi_brackets.iterrows(): - lower, upper = ( - bracket["LOWER_BOUND"], - bracket["UPPER_BOUND"], - ) - - # Create masks for this variable/bracket/type combination - state_mask = ( - (state["VARIABLE"] == variable) - & (state["LOWER_BOUND"] == lower) - & (state["UPPER_BOUND"] == upper) - & (state["IS_COUNT"] == is_count) - ) - district_mask = ( - (district["VARIABLE"] == variable) - & (district["LOWER_BOUND"] == lower) - & (district["UPPER_BOUND"] == upper) - & (district["IS_COUNT"] == is_count) - ) - - # Get state totals indexed by STATEFIPS - state_totals = state.loc[state_mask].set_index("STATEFIPS")[ - "VALUE" - ] - - # Get district totals grouped by STATEFIPS - district_totals = ( - district.loc[district_mask] - .groupby("STATEFIPS")["VALUE"] - .sum() - ) - - # Check and rescale districts for each state - for fips, d_total in district_totals.items(): - s_total = state_totals.get(fips) - - if s_total is not None and not np.isclose( - d_total, s_total, rtol=1e-3 - ): - count_type = "count" if is_count == 1.0 else "amount" - logger.warning( - f"Districts' sum does not match {fips} state total for {variable}/{count_type} " - f"in bracket [{lower}, {upper}]. Rescaling district targets." 
- ) - rescale_mask = district_mask & ( - district["STATEFIPS"] == fips - ) - district.loc[rescale_mask, "VALUE"] *= ( - s_total / d_total - ) - - # Combine all data - combined = pd.concat( - [ - national, - state.drop(columns="STATEFIPS"), - ( - district.drop(columns="STATEFIPS") - if districts - else pd.DataFrame(columns=national.columns) - ), - ], - ignore_index=True, - ).sort_values(["GEO_ID", "VARIABLE", "LOWER_BOUND"]) - - combined["DATA_SOURCE"] = "soi" - combined["BREAKDOWN_VARIABLE"] = "adjusted_gross_income" - - combined = combined[ - [ - "DATA_SOURCE", - "GEO_ID", - "GEO_NAME", - "VARIABLE", - "VALUE", - "IS_COUNT", - "BREAKDOWN_VARIABLE", - "LOWER_BOUND", - "UPPER_BOUND", - ] - ] - - # Save combined data - out_path = CALIBRATION_FOLDER / "soi.csv" - combined.to_csv(out_path, index=False) - logger.info(f"Combined SOI targets saved to {out_path}") - - -def main() -> None: - combine_geography_levels() - - -if __name__ == "__main__": - main() From 6d482e7fa50f4f5dfed9b1f9b2a514e9652505c9 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Thu, 14 Aug 2025 17:59:23 -0400 Subject: [PATCH 12/27] all major targets loaded --- Makefile | 6 +- .../db/create_initial_strata.py | 72 +++++ policyengine_us_data/db/etl_age.py | 101 ++----- policyengine_us_data/db/etl_eitc_only.py | 216 ------------- policyengine_us_data/db/etl_irs_soi.py | 284 ++++++++++++------ policyengine_us_data/db/etl_medicaid.py | 28 +- policyengine_us_data/db/etl_snap.py | 160 ++-------- policyengine_us_data/db/temp.py | 57 ++++ .../make_district_mapping.py | 254 ++++++++++++++++ policyengine_us_data/utils/census.py | 23 ++ policyengine_us_data/utils/db.py | 61 ++++ 11 files changed, 721 insertions(+), 541 deletions(-) create mode 100644 policyengine_us_data/db/create_initial_strata.py delete mode 100644 policyengine_us_data/db/etl_eitc_only.py create mode 100644 policyengine_us_data/db/temp.py create mode 100644 policyengine_us_data/storage/calibration_targets/make_district_mapping.py create mode 100644 policyengine_us_data/utils/db.py diff --git a/Makefile b/Makefile index 4124babc..01999135 100644 --- a/Makefile +++ b/Makefile @@ -61,7 +61,11 @@ documentation-dev: database: python policyengine_us_data/db/create_database_tables.py - python policyengine_us_data/db/load_age_targets.py + python policyengine_us_data/db/create_initial_strata.py + python policyengine_us_data/db/etl_age.py + python policyengine_us_data/db/etl_medicaid.py + python policyengine_us_data/db/etl_snap.py + python policyengine_us_data/db/etl_irs_soi.py clean-database: rm *.db diff --git a/policyengine_us_data/db/create_initial_strata.py b/policyengine_us_data/db/create_initial_strata.py new file mode 100644 index 00000000..0a7e7f7a --- /dev/null +++ b/policyengine_us_data/db/create_initial_strata.py @@ -0,0 +1,72 @@ +from typing import Dict + +import pandas as pd +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker +from sqlmodel import SQLModel, Session, select + + +from policyengine_us.variables.household.demographic.geographic.ucgid.ucgid_enum import UCGID +from policyengine_us_data.db.create_database_tables import ( + Stratum, + StratumConstraint, +) + + + +def main(): + # Get the implied hierarchy by the UCGID enum -------- + rows = [] + for node in UCGID: + codes = node.get_hierarchical_codes() + rows.append({ + "name": node.name, + "code": codes[0], + "parent": codes[1] if len(codes) > 1 else None + }) + + hierarchy_df = ( + pd.DataFrame(rows) + .sort_values(["parent", "code"], na_position="first") + 
.reset_index(drop=True) + ) + + + DATABASE_URL = "sqlite:///policy_data.db" + engine = create_engine(DATABASE_URL) + + Session = sessionmaker(bind=engine) + session = Session() + + # map the ucgid_str 'code' to auto-generated 'stratum_id' + code_to_stratum_id: Dict[str, int] = {} + + for _, row in hierarchy_df.iterrows(): + parent_code = row["parent"] + + parent_id = code_to_stratum_id.get(parent_code) if parent_code else None + + new_stratum = Stratum( + parent_stratum_id=parent_id, + notes=f'{row["name"]} (ucgid {row["code"]})', + stratum_group_id=1, + ) + + new_stratum.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="in", + value=row["code"], + ) + ] + + session.add(new_stratum) + + session.flush() + + code_to_stratum_id[row["code"]] = new_stratum.stratum_id + + session.commit() + +if __name__ == "__main__": + main() diff --git a/policyengine_us_data/db/etl_age.py b/policyengine_us_data/db/etl_age.py index e168317b..084a43d6 100644 --- a/policyengine_us_data/db/etl_age.py +++ b/policyengine_us_data/db/etl_age.py @@ -12,6 +12,7 @@ StratumConstraint, Target, ) +from policyengine_us_data.utils.census import get_census_docs, pull_acs_table LABEL_TO_SHORT = { @@ -32,67 +33,11 @@ "Estimate!!Total!!Total population!!AGE!!70 to 74 years": "70-74", "Estimate!!Total!!Total population!!AGE!!75 to 79 years": "75-79", "Estimate!!Total!!Total population!!AGE!!80 to 84 years": "80-84", - "Estimate!!Total!!Total population!!AGE!!85 years and over": "85-inf", + "Estimate!!Total!!Total population!!AGE!!85 years and over": "85-999", } AGE_COLS = list(LABEL_TO_SHORT.values()) -def extract_docs(year=2023): - docs_url = ( - f"https://api.census.gov/data/{year}/acs/acs1/subject/variables.json" - ) - - try: - docs_response = requests.get(docs_url) - docs_response.raise_for_status() - - docs = docs_response.json() - docs["year"] = year - - except requests.exceptions.RequestException as e: - print(f"Error during API request: {e}") - raise - except Exception as e: - print(f"An error occurred: {e}") - raise - return docs - - -def extract_age_data(geo, year=2023): - base_url = ( - f"https://api.census.gov/data/{year}/acs/acs1/subject?get=group(S0101)" - ) - - if geo == "State": - url = f"{base_url}&for=state:*" - elif geo == "District": - url = f"{base_url}&for=congressional+district:*" - elif geo == "National": - url = f"{base_url}&for=us:*" - else: - raise ValueError( - "geo must be either 'National', 'State', or 'District'" - ) - - try: - response = requests.get(url) - response.raise_for_status() - - data = response.json() - - headers = data[0] - data_rows = data[1:] - df = pd.DataFrame(data_rows, columns=headers) - - except requests.exceptions.RequestException as e: - print(f"Error during API request: {e}") - raise - except Exception as e: - print(f"An error occurred: {e}") - raise - return df - - def transform_age_data(age_data, docs): df = age_data.copy() @@ -131,13 +76,14 @@ def transform_age_data(age_data, docs): var_name="age_range", value_name="value", ) - age_bounds = df_long["age_range"].str.split("-", expand=True) - df_long["age_greater_than_or_equal_to"] = ( - age_bounds[0].str.replace("+", "").astype(int) - ) - df_long["age_less_than_or_equal_to"] = pd.to_numeric(age_bounds[1]) + age_bounds = df_long["age_range"].str.split("-", expand=True).astype(int) + age_bounds.columns = ["ge", "le"] + age_bounds[['gt']] = age_bounds[["ge"]] - 1 + age_bounds[['lt']] = age_bounds[["le"]] + 1 + + df_long["age_greater_than"] = age_bounds[["gt"]] + df_long["age_less_than"] = 
age_bounds[["lt"]] df_long["variable"] = "person_count" - df_long["period"] = docs["year"] df_long["reform_id"] = 0 df_long["source_id"] = 1 df_long["active"] = True @@ -149,7 +95,7 @@ def get_parent_geo(geo): return {"National": None, "State": "National", "District": "State"}[geo] -def load_age_data(df_long, geo, stratum_lookup={}): +def load_age_data(df_long, geo, year, stratum_lookup={}): # Quick data quality check before loading ---- if geo == "National": @@ -192,6 +138,7 @@ def load_age_data(df_long, geo, stratum_lookup={}): ) # Create constraints and link them to the parent's relationship attribute. + # TODO: greater_than_or_equal_to to just greater than! new_stratum.constraints_rel = [ StratumConstraint( constraint_variable="ucgid_str", @@ -200,18 +147,18 @@ def load_age_data(df_long, geo, stratum_lookup={}): ), StratumConstraint( constraint_variable="age", - operation="greater_than_or_equal", - value=str(row["age_greater_than_or_equal_to"]), + operation="greater_than", + value=str(row["age_greater_than"]), ), ] - age_lt_value = row["age_less_than_or_equal_to"] + age_lt_value = row["age_less_than"] if not np.isinf(age_lt_value): new_stratum.constraints_rel.append( StratumConstraint( constraint_variable="age", operation="less_than", - value=str(age_lt_value + 1), + value=str(row["age_less_than"]), ) ) @@ -219,7 +166,7 @@ def load_age_data(df_long, geo, stratum_lookup={}): new_stratum.targets_rel.append( Target( variable=row["variable"], - period=row["period"], + period=year, value=row["value"], source_id=row["source_id"], active=row["active"], @@ -243,18 +190,24 @@ def load_age_data(df_long, geo, stratum_lookup={}): if __name__ == "__main__": # --- ETL: Extract, Transform, Load ---- + year = 2023 # ---- Extract ---------- - docs = extract_docs(2023) - national_df = extract_age_data("National", 2023) - state_df = extract_age_data("State", 2023) + docs = get_census_docs(year) + national_df = pull_acs_table("S0101", "National", year) + state_df = pull_acs_table("S0101", "State", year) + district_df = pull_acs_table("S0101", "District", year) # --- Transform ---------- long_national_df = transform_age_data(national_df, docs) long_state_df = transform_age_data(state_df, docs) + long_district_df = transform_age_data(district_df, docs) # --- Load -------- - national_strata_lku = load_age_data(long_national_df, "National") + national_strata_lku = load_age_data(long_national_df, "National", year) state_strata_lku = load_age_data( - long_state_df, "State", national_strata_lku + long_state_df, "State", year, national_strata_lku + ) + load_age_data( + long_district_df, "District", year, state_strata_lku ) diff --git a/policyengine_us_data/db/etl_eitc_only.py b/policyengine_us_data/db/etl_eitc_only.py deleted file mode 100644 index 20d52cef..00000000 --- a/policyengine_us_data/db/etl_eitc_only.py +++ /dev/null @@ -1,216 +0,0 @@ -import logging -import requests -from pathlib import Path -import io - -import pandas as pd -import numpy as np -from sqlalchemy import create_engine -from sqlalchemy.orm import sessionmaker - -from policyengine_us_data.db.create_database_tables import ( - Stratum, - StratumConstraint, - Target, -) - - -logger = logging.getLogger(__name__) - - -def extract_eitc_data(): - # IRS Table 2.5, Tax Year 2020S - url = "https://www.irs.gov/pub/irs-soi/20in25ic.xls" - r = requests.get(url, timeout=30) - r.raise_for_status() - - # Pandas uses xlrd to open .xls - xls = pd.ExcelFile(io.BytesIO(r.content), engine="xlrd") - sheets = {name: xls.parse(name, header=None) for name in 
xls.sheet_names} - - raw = sheets[xls.sheet_names[0]] - return raw - - -def transform_eitc_data(raw_data): - # This is not ideal from a data processing standpoint, but it's too much - # effort to fully parse this hierarchical XLS for a few data points - # At least the full lineage is represented from the source - - zero_children_returns = raw_data.iloc[8, 25] - zero_children_amount = raw_data.iloc[8, 26] * 1000 - - one_child_returns = raw_data.iloc[8, 39] - one_child_amount = raw_data.iloc[8, 40] * 1000 - - two_children_returns = raw_data.iloc[8, 57] - two_children_amount = raw_data.iloc[8, 58] * 1000 - - three_plus_children_returns = raw_data.iloc[8, 73] - three_plus_children_amount = raw_data.iloc[8, 74] * 1000 - - assert zero_children_returns == 7636714 - assert zero_children_amount == 2255068000 - - df_long = pd.DataFrame( - [ - [ - "0100000US", - "children_equal_to", - 0, - "tax_unit_count", - zero_children_returns, - ], - [ - "0100000US", - "children_equal_to", - 1, - "tax_unit_count", - one_child_returns, - ], - [ - "0100000US", - "children_equal_to", - 2, - "tax_unit_count", - two_children_returns, - ], - [ - "0100000US", - "children_greater_or_equal_to", - 3, - "tax_unit_count", - three_plus_children_returns, - ], - [ - "0100000US", - "children_equal_to", - 0, - "eitc", - zero_children_amount, - ], - ["0100000US", "children_equal_to", 1, "eitc", one_child_returns], - [ - "0100000US", - "children_equal_to", - 2, - "eitc", - two_children_returns, - ], - [ - "0100000US", - "children_greater_or_equal_to", - 3, - "eitc", - three_plus_children_returns, - ], - ] - ) - - df_long.columns = [ - "ucgid", - "constraint", - "constraint_value", - "variable", - "value", - ] - - df_long["period"] = 2020 - df_long["reform_id"] = 0 - df_long["source_id"] = 2 - df_long["active"] = True - - return df_long - - -def load_eitc_data(df_long): - - DATABASE_URL = "sqlite:///policy_data.db" - engine = create_engine(DATABASE_URL) - - Session = sessionmaker(bind=engine) - session = Session() - - ucgid = df_long.iloc[0]["ucgid"] - for num_children in [0, 1, 2, 3]: - note = f"eitc_child_count: {num_children}, Geo: {ucgid}" - new_stratum = Stratum( - parent_stratum_id=None, stratum_group_id=0, notes=note - ) - - new_stratum.constraints_rel = [ - StratumConstraint( - constraint_variable="ucgid", - operation="equals", - value=ucgid, - ), - ] - - if num_children <= 2: - new_stratum.constraints_rel.append( - StratumConstraint( - constraint_variable="eitc_child_count", - operation="equals", - value=str(num_children), - ), - ) - elif num_children > 2: - new_stratum.constraints_rel.append( - StratumConstraint( - constraint_variable="eitc_child_count", - operation="greater_or_equal_than", - value=str(3), - ), - ) - - rows = df_long.loc[df_long["constraint_value"] == num_children] - count_target = rows.loc[rows.variable == "tax_unit_count"][ - "value" - ].values[0] - amount_target = rows.loc[rows.variable == "eitc"]["value"].values[0] - - # Avoiding magic numbers in the load step - count_active = rows.loc[rows.variable == "tax_unit_count"][ - "active" - ].values[0] - amount_active = rows.loc[rows.variable == "eitc"]["active"].values[0] - - period = rows.iloc[0]["period"] - source_id = rows.iloc[0]["source_id"] - - new_stratum.targets_rel = [ - Target( - variable="eitc", - period=period, - value=amount_target, - source_id=source_id, - active=amount_active, - ), - Target( - variable="tax_unit_count", - period=period, - value=amount_target, - source_id=source_id, - active=count_active, - ), - ] - - session.add(new_stratum) 
- session.flush() - print(new_stratum.stratum_id) - - session.commit() - - -if __name__ == "__main__": - - # --- ETL: Extract, Transform, Load ---- - - # ---- Extract ---------- - national_df = extract_eitc_data() - - # --- Transform ---------- - long_national_df = transform_eitc_data(national_df) - - # --- Load -------- - state_strata_lku = load_eitc_data(long_national_df) diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py index d9eeb503..c93eb593 100644 --- a/policyengine_us_data/db/etl_irs_soi.py +++ b/policyengine_us_data/db/etl_irs_soi.py @@ -1,65 +1,36 @@ -from pathlib import Path -from typing import List, Optional, Sequence, Dict, Tuple, Any, Union +from typing import Optional import numpy as np import pandas as pd -from sqlalchemy import create_engine -from sqlalchemy.orm import sessionmaker +from sqlmodel import Session, create_engine from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, Target, ) - - - - -"""Utilities to pull AGI targets from the IRS SOI data files.""" - -# Congressional districts have one fewer level than the national and state -# They're missing the million plus category -# ("No AGI Stub") is a specific, intentional category used by the IRS in its summary data files. -# -#SOI_COLUMNS = [ -# "Under $1", -# "$1 under $10,000", -# "$10,000 under $25,000", -# "$25,000 under $50,000", -# "$50,000 under $75,000", -# "$75,000 under $100,000", -# "$100,000 under $200,000", -# "$200,000 under $500,000", -# "$500,000 or more", -#] -# -#AGI_STUB_TO_BAND = {i + 1: band for i, band in enumerate(SOI_COLUMNS)} -# -#AGI_BOUNDS = { -# "Under $1": (-np.inf, 1), -# "$1 under $10,000": (1, 10_000), -# "$10,000 under $25,000": (10_000, 25_000), -# "$25,000 under $50,000": (25_000, 50_000), -# "$50,000 under $75,000": (50_000, 75_000), -# "$75,000 under $100,000": (75_000, 100_000), -# "$100,000 under $200,000": (100_000, 200_000), -# "$200,000 under $500,000": (200_000, 500_000), -# "$500,000 or more": (500_000, np.inf), -#} -# -##NON_VOTING_STATES = {"US", "AS", "GU", "MP", "PR", "VI", "OA"} -# -IGNORE_GEO_IDS = { - "0400000US72", # Puerto Rico (state level) - "5001800US7298", # Puerto Rico - "5001800US6098", # American Samoa - "5001800US6698", # Guam - "5001800US6998", # Northern Mariana Islands - "5001800US7898", # U.S. 
Virgin Islands
+from policyengine_us_data.utils.db import get_stratum_by_id, get_simple_stratum_by_ucgid, get_root_strata, get_stratum_children, get_stratum_parent
+from policyengine_us_data.utils.census import TERRITORY_UCGIDS
+from policyengine_us_data.storage.calibration_targets.make_district_mapping import get_district_mapping
+
+
+"""See the 22incddocguide.docx manual from the IRS SOI"""
+# Let's make this work with strict inequalities
+# Interpret Language: '$10,000 under $25,000'
+epsilon = 0.005  # Half a penny
+AGI_STUB_TO_INCOME_RANGE = {
+    1: (-np.inf, 1),
+    2: (1 - epsilon, 10_000),
+    3: (10_000 - epsilon, 25_000),
+    4: (25_000 - epsilon, 50_000),
+    5: (50_000 - epsilon, 75_000),
+    6: (75_000 - epsilon, 100_000),
+    7: (100_000 - epsilon, 200_000),
+    8: (200_000 - epsilon, 500_000),
+    9: (500_000 - epsilon, np.inf),
 }
 
-
 def create_records(df, breakdown_variable, target_variable):
     """Transforms a DataFrame subset into a standardized list of records."""
     temp_df = df[["ucgid_str"]].copy()
@@ -123,10 +94,39 @@ def make_agi_long(df: pd.DataFrame) -> pd.DataFrame:
         "breakdown_value",
         "target_variable",
         "target_value"]]
-    return (
-        long.sort_values(["ucgid_str", "breakdown_value", "target_variable"])
-        .reset_index(drop=True)
-    )
+
+    return [
+        df.sort_values(by='ucgid_str').reset_index(drop=True)
+        for name, df in long.groupby(['breakdown_value', 'target_variable'])
+    ]
+
+
+def convert_district_data(
+    input_df: pd.DataFrame,
+    mapping_matrix: np.ndarray,  # 436 x 436
+    new_district_codes
+) -> pd.DataFrame:
+    """Transforms data from pre- to post- 2020 census districts"""
+    df = input_df.copy()
+    old_districts_df = df[df['ucgid_str'].str.startswith("5001800US")].copy()
+    old_districts_df = old_districts_df.sort_values('ucgid_str').reset_index(drop=True)
+    old_values = old_districts_df['target_value'].to_numpy()
+    new_values = mapping_matrix.T @ old_values
+
+    # Create a new DataFrame for the transformed data, preserving the original schema. 
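+    # A toy check of the matrix step above, with illustrative numbers: if two
+    # old districts hold values [10, 20] and the population-share rows are
+    # M = [[1.0, 0.0], [0.25, 0.75]], then the new totals are
+    # M.T @ [10, 20] = [10*1.0 + 20*0.25, 10*0.0 + 20*0.75] = [15.0, 15.0],
+    # so value moves between districts while the overall sum is preserved.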
+    new_districts_df = pd.DataFrame({
+        'ucgid_str': new_district_codes,
+        'breakdown_variable': old_districts_df['breakdown_variable'],
+        'breakdown_value': old_districts_df['breakdown_value'],
+        'target_variable': old_districts_df['target_variable'],
+        'target_value': new_values
+    })
+
+    other_geos_df = df[~df['ucgid_str'].str.startswith("5001800US")].copy()
+
+    final_df = pd.concat([other_geos_df, new_districts_df], ignore_index=True)
+
+    return final_df
 
 
 def extract_soi_data() -> pd.DataFrame:
@@ -195,7 +195,7 @@ def transform_soi_data(raw_df):
         district_df["CONG_DISTRICT"].astype(int).astype(str).str.zfill(2)
     )
     district_df["ucgid_str"] = "5001800US" + district_df["STATEFIPS"] + district_df["CONG_DISTRICT"]
-    district_df = district_df[~district_df["ucgid_str"].isin(IGNORE_GEO_IDS)]
+    district_df = district_df[~district_df["ucgid_str"].isin(TERRITORY_UCGIDS)]
 
     assert district_df.shape[0] % 436 == 0
 
@@ -244,12 +244,15 @@ def transform_soi_data(raw_df):
     all_agi_splits = all_df.copy().loc[all_df.agi_stub != 0]
     assert all_agi_splits.shape[0] % (436 + 51 + 0) == 0
 
-    agi_long = make_agi_long(all_agi_splits)
-    agi_long = agi_long.loc[agi_long.target_variable != "agi_total_amount"]
+    agi_long_records = make_agi_long(all_agi_splits)
 
-    records.append(agi_long)
+    records.extend(agi_long_records)
 
-    return pd.concat(records)
+    # Pre- to Post- 2020 Census redistricting
+    mapping = get_district_mapping()
+    converted = [convert_district_data(r, mapping['mapping_matrix'], mapping['new_codes']) for r in records]
+
+    return converted
 
 
 def load_soi_data(long_dfs, year):
@@ -257,11 +260,10 @@ def load_soi_data(long_dfs, year):
     DATABASE_URL = "sqlite:///policy_data.db"
     engine = create_engine(DATABASE_URL)
 
-    Session = sessionmaker(bind=engine)
-    session = Session()
+    session = Session(engine)
 
     # Load EITC data --------------------------------------------------------
-    # NOTE: obviously this is not especially robust ---
+    # Obviously this is not especially robust ---
     eitc_data = {'0': (long_dfs[0], long_dfs[1]),
                  '1': (long_dfs[2], long_dfs[3]),
                  '2': (long_dfs[4], long_dfs[5]),
@@ -302,8 +304,8 @@ def load_soi_data(long_dfs, year):
             new_stratum.constraints_rel.append(
                 StratumConstraint(
                     constraint_variable="eitc_children",
-                    operation="greater_than_or_equal_to",
-                    value='3',
+                    operation="greater_than",
+                    value='2',
                 )
             )
         else:
@@ -316,13 +318,14 @@ def load_soi_data(long_dfs, year):
             )
 
         new_stratum.targets_rel = [
-            Target(
-                variable="tax_unit_count",
-                period=year,
-                value=eitc_count_i.iloc[i][["target_value"]].values[0],
-                source_id=5,
-                active=True,
-            ),
+            # It's already complex enough
+            #Target(
+            #    variable="tax_unit_count",
+            #    period=year,
+            #    value=eitc_count_i.iloc[i][["target_value"]].values[0],
+            #    source_id=5,
+            #    active=True,
+            #),
             Target(
                 variable="eitc",
                 period=year,
@@ -340,26 +343,104 @@ def load_soi_data(long_dfs, year):
         elif len(ucgid_i) == 11:
             stratum_lookup["State"][ucgid_i] = new_stratum.stratum_id
 
+    session.commit()
 
     # No breakdown variables in this set
     for j in range(8, 42, 2):
-        print(long_dfs[j])  # count
-        print(long_dfs[j + 1])  # amount
-
-        # Why are we making strata here? You have a lot of these to run through
         count_j, amount_j = long_dfs[j], long_dfs[j + 1]
+        amount_variable_name = amount_j.iloc[0][["target_variable"]].values[0]
+        print(f"Loading amount data for IRS SOI data on {amount_variable_name}")
         for i in range(count_j.shape[0]):
             ucgid_i = count_j[['ucgid_str']].iloc[i].values[0]
-            # If there's no breakdown variable, is this a new geo? 
-            # The problem is, it's vary difficult to search for a geography
-            # That's already in existance
-            note = f"Geo: {ucgid_i}"
-
-            if len(ucgid_i) == 9:  # National.
-                new_stratum = Stratum(
-                    parent_stratum_id=None, stratum_group_id=0, notes=note
+            # Reusing an existing stratum this time, since there is no breakdown
+            stratum = get_simple_stratum_by_ucgid(session, ucgid_i)
+            amount_value = amount_j.iloc[i][["target_value"]].values[0]
+
+            stratum.targets_rel.append(
+                # NOTE: If I do the counts, I'm going to need to explode the strata for the vars != 0
+                # OR, create new variables like qbid_tax_unit_count which requires adding stuff to -us
+                # AND, it's already complex enough -----
+                #Target(
+                #    variable="tax_unit_count",
+                #    period=year,
+                #    value=count_j.iloc[i][["target_value"]].values[0],
+                #    source_id=5,
+                #    active=True,
+                #),
+                Target(
+                    variable=amount_variable_name,
+                    period=year,
+                    value=amount_value,
+                    source_id=5,
+                    active=True,
                 )
-            elif len(ucgid_i) == 11:  # State
+            )
+
+            session.add(stratum)
+            session.flush()
+
+    session.commit()
+
+    # Adjusted Gross Income ------
+    agi_values = long_dfs[42]
+
+    for i in range(agi_values.shape[0]):
+        ucgid_i = agi_values[['ucgid_str']].iloc[i].values[0]
+        stratum = get_simple_stratum_by_ucgid(session, ucgid_i)
+        stratum.targets_rel.append(
+            Target(
+                variable="agi",
+                period=year,
+                value=agi_values.iloc[i][["target_value"]].values[0],
+                source_id=5,
+                active=True,
+            )
+        )
+        session.add(stratum)
+        session.flush()
+
+    session.commit()
+
+    agi_person_count_dfs = [df for df in long_dfs[43:] if df['target_variable'].iloc[0] == 'agi_person_count']
+
+    for agi_df in agi_person_count_dfs:
+        agi_stub = agi_df.iloc[0][["breakdown_value"]].values[0]
+        agi_income_lower, agi_income_upper = AGI_STUB_TO_INCOME_RANGE[agi_stub]
+
+        # Make a National Stratum for each AGI Stub, even though there's no national target
+        # There is no national target because the data set only has agi_stub = 0 for national
+        note = f"Geo: 0100000US, AGI > {agi_income_lower}, AGI < {agi_income_upper}"
+        nat_stratum = Stratum(
+            parent_stratum_id=None, stratum_group_id=0, notes=note
+        )
+        nat_stratum.constraints_rel.extend([
+            StratumConstraint(
+                constraint_variable="ucgid_str",
+                operation="in",
+                value="0100000US",
+            ),
+            StratumConstraint(
+                constraint_variable="agi",
+                operation="greater_than",
+                value=str(agi_income_lower),
+            ),
+            StratumConstraint(
+                constraint_variable="agi",
+                operation="less_than",
+                value=str(agi_income_upper),
+            ),
+        ])
+        session.add(nat_stratum)
+        session.flush()
+
+        stratum_lookup = {"National": nat_stratum.stratum_id, "State": {}, "District": {}}
+        for i in range(agi_df.shape[0]):
+            ucgid_i = agi_df[['ucgid_str']].iloc[i].values[0]
+            note = f"Geo: {ucgid_i}, AGI > {agi_income_lower}, AGI < {agi_income_upper}"
+
+            person_count = agi_df.iloc[i][["target_value"]].values[0]
+
+            if len(ucgid_i) == 11:  # State
                 new_stratum = Stratum(
                     parent_stratum_id=stratum_lookup["National"],
                     stratum_group_id=0,
@@ -371,26 +452,28 @@ def load_soi_data(long_dfs, year):
                     stratum_group_id=0,
                     notes=note
                 )
-
-            new_stratum.constraints_rel = [
+            new_stratum.constraints_rel.extend([
                 StratumConstraint(
                     constraint_variable="ucgid_str",
                     operation="in",
                     value=ucgid_i,
                 ),
-            ]
+                StratumConstraint(
+                    constraint_variable="agi",
+                    operation="greater_than",
+                    value=str(agi_income_lower),
+                ),
+                StratumConstraint(
+                    constraint_variable="agi",
+                    operation="less_than",
+                    value=str(agi_income_upper),
+                ),
+            ])
             new_stratum.targets_rel = [
                 Target(
-                    variable="tax_unit_count",
+                    variable="person_count",
                     period=year,
-                    value=count_j.iloc[i][["target_value"]].values[0],
-                    source_id=5,
-                    
active=True, - ), - Target( - variable=amount_j.iloc[0][["target_variable"]].values[0], - period=year, - value=amount_j.iloc[i][["target_value"]].values[0], + value=person_count, source_id=5, active=True, ) @@ -407,12 +490,17 @@ def load_soi_data(long_dfs, year): session.commit() - -def main() -> None: +def main(): year = 2022 # NOTE: predates the finalization of the 2020 Census redistricting + + # Extract ----------------------- raw_df = extract_soi_data() - long_dfs = transform_soi_data(raw_df): + # Transform --------------------- + long_dfs = transform_soi_data(raw_df) + + # Load --------------------- + load_soi_data(long_dfs, year) if __name__ == "__main__": diff --git a/policyengine_us_data/db/etl_medicaid.py b/policyengine_us_data/db/etl_medicaid.py index d1babe31..ec16ac71 100644 --- a/policyengine_us_data/db/etl_medicaid.py +++ b/policyengine_us_data/db/etl_medicaid.py @@ -9,22 +9,7 @@ StratumConstraint, Target, ) - - -# State abbreviation to FIPS code mapping -state_fips_map = { - 'AL': '01', 'AK': '02', 'AZ': '04', 'AR': '05', 'CA': '06', - 'CO': '08', 'CT': '09', 'DE': '10', 'FL': '12', 'GA': '13', - 'HI': '15', 'ID': '16', 'IL': '17', 'IN': '18', 'IA': '19', - 'KS': '20', 'KY': '21', 'LA': '22', 'ME': '23', 'MD': '24', - 'MA': '25', 'MI': '26', 'MN': '27', 'MS': '28', 'MO': '29', - 'MT': '30', 'NE': '31', 'NV': '32', 'NH': '33', 'NJ': '34', - 'NM': '35', 'NY': '36', 'NC': '37', 'ND': '38', 'OH': '39', - 'OK': '40', 'OR': '41', 'PA': '42', 'RI': '44', 'SC': '45', - 'SD': '46', 'TN': '47', 'TX': '48', 'UT': '49', 'VT': '50', - 'VA': '51', 'WA': '53', 'WV': '54', 'WI': '55', 'WY': '56', - 'DC': '11' -} +from policyengine_us_data.utils.census import STATE_ABBREV_TO_FIPS def extract_medicaid_data(year): @@ -63,7 +48,7 @@ def transform_medicaid_data(state_admin_df, cd_survey_df, year): ["State Abbreviation", "Reporting Period", "Total Medicaid Enrollment"] ] - state_df["FIPS"] = state_df["State Abbreviation"].map(state_fips_map) + state_df["FIPS"] = state_df["State Abbreviation"].map(STATE_ABBREV_TO_FIPS) cd_df = cd_survey_df[["GEO_ID", "state", "congressional district", "S2704_C02_006E"]] @@ -100,7 +85,7 @@ def load_medicaid_data(long_state, long_cd, year): nat_stratum.constraints_rel = [ StratumConstraint( constraint_variable="ucgid_str", - operation="equals", + operation="in", value="0100000US", ), StratumConstraint( @@ -128,7 +113,7 @@ def load_medicaid_data(long_state, long_cd, year): new_stratum.constraints_rel = [ StratumConstraint( constraint_variable="ucgid_str", - operation="equals", + operation="in", value=row["ucgid_str"], ), StratumConstraint( @@ -162,7 +147,7 @@ def load_medicaid_data(long_state, long_cd, year): new_stratum.constraints_rel = [ StratumConstraint( constraint_variable="ucgid_str", - operation="equals", + operation="in", value=row["ucgid_str"], ), StratumConstraint( @@ -190,8 +175,11 @@ def load_medicaid_data(long_state, long_cd, year): year = 2023 + # Extract ------------------------------ cd_survey_df, state_admin_df = extract_medicaid_data(year) + # Transform ------------------- long_state, long_cd = transform_medicaid_data(state_admin_df, cd_survey_df, year) + # Load ----------------------- load_medicaid_data(long_state, long_cd, year) diff --git a/policyengine_us_data/db/etl_snap.py b/policyengine_us_data/db/etl_snap.py index a0f20133..f9a172a9 100644 --- a/policyengine_us_data/db/etl_snap.py +++ b/policyengine_us_data/db/etl_snap.py @@ -17,68 +17,11 @@ Target, ) from policyengine_us_data.utils.census import ( - get_census_docs, pull_acs_table, 
STATE_NAME_TO_FIPS, ) -STATE_NAME_TO_FIPS = { - "Alabama": "01", - "Alaska": "02", - "Arizona": "04", - "Arkansas": "05", - "California": "06", - "Colorado": "08", - "Connecticut": "09", - "Delaware": "10", - "District of Columbia": "11", - "Florida": "12", - "Georgia": "13", - "Hawaii": "15", - "Idaho": "16", - "Illinois": "17", - "Indiana": "18", - "Iowa": "19", - "Kansas": "20", - "Kentucky": "21", - "Louisiana": "22", - "Maine": "23", - "Maryland": "24", - "Massachusetts": "25", - "Michigan": "26", - "Minnesota": "27", - "Mississippi": "28", - "Missouri": "29", - "Montana": "30", - "Nebraska": "31", - "Nevada": "32", - "New Hampshire": "33", - "New Jersey": "34", - "New Mexico": "35", - "New York": "36", - "North Carolina": "37", - "North Dakota": "38", - "Ohio": "39", - "Oklahoma": "40", - "Oregon": "41", - "Pennsylvania": "42", - "Rhode Island": "44", - "South Carolina": "45", - "South Dakota": "46", - "Tennessee": "47", - "Texas": "48", - "Utah": "49", - "Vermont": "50", - "Virginia": "51", - "Washington": "53", - "West Virginia": "54", - "Wisconsin": "55", - "Wyoming": "56", -} - -# Administrative data ------------------------------------------------ - def extract_administrative_snap_data(year=2023): """ Downloads and extracts annual state-level SNAP data from the USDA FNS zip file. @@ -122,6 +65,10 @@ def extract_administrative_snap_data(year=2023): return zipfile.ZipFile(io.BytesIO(response.content)) +def extract_survey_snap_data(year): + return pull_acs_table("S2201", "District", year) + + def transform_administrative_snap_data(zip_file, year): filename = f"FY{str(year)[-2:]}.xlsx" with zip_file.open(filename) as f: @@ -185,6 +132,22 @@ def transform_administrative_snap_data(zip_file, year): return df_states +def transform_survey_snap_data(raw_df): + df = raw_df.copy() + return df[["GEO_ID", "S2201_C03_001E"]].rename({ + "GEO_ID": "ucgid_str", + "S2201_C03_001E": "snap_household_ct" + }, axis=1 + )[ + ~df["GEO_ID"].isin( + [ # Puerto Rico's state and district + "0400000US72", + "5001800US7298", + ] + ) + ] + + def load_administrative_snap_data(df_states, year): DATABASE_URL = "sqlite:///policy_data.db" @@ -211,7 +174,8 @@ def load_administrative_snap_data(df_states, year): value="0", ), ] - # No target at the national level is provided at this time. + # No target at the national level is provided at this time. 
Keeping it
+    # so that the state strata can have a parent stratum
     session.add(nat_stratum)
     session.flush()
@@ -266,40 +230,7 @@ def load_administrative_snap_data(df_states, year):
     return stratum_lookup
 
 
-# Survey data ------------------------------------------------------
-
-def extract_survey_snap_data(year):
-
-    raw_dfs = {}
-    for geo in ["District", "State", "National"]:
-        df = pull_acs_table("S2201", geo, year)
-        raw_dfs[geo] = df
-
-    return raw_dfs
-
-
-def transform_survey_snap_data(raw_dfs):
-
-    dfs = {}
-    for geo in raw_dfs.keys():
-        df = raw_dfs[geo]
-        dfs[geo] = df_data = df[["GEO_ID", "S2201_C03_001E"]].rename({
-            "GEO_ID": "ucgid_str",
-            "S2201_C03_001E": "snap_household_ct"
-            }, axis=1
-        )[
-            ~df["GEO_ID"].isin(
-                [  # Puerto Rico's state and district
-                    "0400000US72",
-                    "5001800US7298",
-                ]
-            )
-        ].copy()
-
-    return dfs
-
-
-def load_survey_snap_data(survey_dfs, year, stratum_lookup ={}):
+def load_survey_snap_data(survey_df, year, stratum_lookup={}):
     """Use an already defined stratum_lookup to load the survey SNAP data"""
 
     DATABASE_URL = "sqlite:///policy_data.db"
@@ -308,43 +239,8 @@ def load_survey_snap_data(survey_dfs, year, stratum_lookup ={}):
     Session = sessionmaker(bind=engine)
     session = Session()
 
-    # National. Use the stratum from the administrative data function
-    nat_df = survey_dfs["National"]
-    nat_stratum = session.get(Stratum, stratum_lookup["National"])
-
-    nat_stratum.targets_rel.append(
-        Target(
-            variable="household_count",
-            period=year,
-            value=nat_df["snap_household_ct"],
-            source_id=4,
-            active=True,
-        )
-    )
-    session.add(nat_stratum)
-    session.flush()
-
-    # Skipping state for now, but
-    # # State. Also use the stratum from the administrative data function
-    # state_df = survey_dfs["State"]
-    # for _, row in state_df.iterrows():
-    #     print(row)
-    #     state_stratum = session.get(Stratum, stratum_lookup["State"][row["ucgid_str"]])

-    #     state_stratum.targets_rel.append(
-    #         Target(
-    #             variable="household_count",
-    #             period=year,
-    #             value=row["snap_household_ct"],
-    #             source_id=4,
-    #             active=True,
-    #         )
-    #     )
-    #     session.add(state_stratum)
-    #     session.flush()
-
-    # You will need to create new strata for districts
-    district_df = survey_dfs["District"]
+    # Create new strata for districts whose households receive SNAP benefits
+    district_df = survey_df.copy()
     for _, row in district_df.iterrows():
         note = f"Geo: {row['ucgid_str']} Received SNAP Benefits"
         state_ucgid_str = '0400000US' + row['ucgid_str'][9:11]
@@ -387,15 +283,15 @@ def main():
 
     # Extract ---------
     zip_file_admin = extract_administrative_snap_data()
-    raw_survey_dfs = extract_survey_snap_data(year)
+    raw_survey_df = extract_survey_snap_data(year)
 
     # Transform -------
     state_admin_df = transform_administrative_snap_data(zip_file_admin, year)
-    survey_dfs = transform_survey_snap_data(raw_survey_dfs)
+    district_survey_df = transform_survey_snap_data(raw_survey_df)
 
     # Load -----------
     stratum_lookup = load_administrative_snap_data(state_admin_df, year)
-    load_survey_snap_data(survey_dfs, year, stratum_lookup)
+    load_survey_snap_data(district_survey_df, year, stratum_lookup)
 
 
 if __name__ == "__main__":
diff --git a/policyengine_us_data/db/temp.py b/policyengine_us_data/db/temp.py
new file mode 100644
index 00000000..6026ace0
--- /dev/null
+++ b/policyengine_us_data/db/temp.py
@@ -0,0 +1,57 @@
+# ucgid_str converts the household’s ucgid enumeration into a comma‑separated string of all hierarchical UCGID codes. 
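+#
+# For example, splitting one such string recovers the district, its state, and
+# the nation (the value below matches the demo output later in this file):
+codes = "5001800US0623,0400000US06,0100000US".split(",")
+assert codes == ["5001800US0623", "0400000US06", "0100000US"]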
+from policyengine_us import Simulation +from policyengine_us.variables.household.demographic.geographic.ucgid.ucgid_enum import UCGID + +# Minimal one-household simulation +sim = Simulation( + situation={ + "people": {"p1": {}}, + "households": {"h1": {"members": ["p1"]}}, + } +) + +# Assign a specific UCGID (California district 23) +sim.set_input("ucgid", 2024, UCGID.CA_23) + +# Use the ucgid_str Variable's formula +ucgid_str_val = sim.calculate("ucgid_str", 2024) +print(ucgid_str_val) +# ['5001800US0623,0400000US06,0100000US'] + + +# First, let's explore UCGID, the enum, and how it can create the hierarchy + +import pandas as pd +from policyengine_us.variables.household.demographic.geographic.ucgid.ucgid_enum import UCGID + +rows = [] +for node in UCGID: + codes = node.get_hierarchical_codes() + rows.append({ + "name": node.name, + "code": codes[0], + "parent": codes[1] if len(codes) > 1 else None + }) + +hierarchy_df = ( + pd.DataFrame(rows) + .sort_values(["parent", "code"], na_position="first") + .reset_index(drop=True) +) + +print(hierarchy_df) +#Out[262]: +# name code parent +#0 US 0100000US None +#1 AL 0400000US01 0100000US +#2 AK 0400000US02 0100000US +#3 AZ 0400000US04 0100000US +#4 AR 0400000US05 0100000US +#.. ... ... ... +#483 WI_05 5001800US5505 0400000US55 +#484 WI_06 5001800US5506 0400000US55 +#485 WI_07 5001800US5507 0400000US55 +#486 WI_08 5001800US5508 0400000US55 +#487 WY_01 5001800US5600 0400000US56 +# +#[488 rows x 3 columns] diff --git a/policyengine_us_data/storage/calibration_targets/make_district_mapping.py b/policyengine_us_data/storage/calibration_targets/make_district_mapping.py new file mode 100644 index 00000000..cc3f50fb --- /dev/null +++ b/policyengine_us_data/storage/calibration_targets/make_district_mapping.py @@ -0,0 +1,254 @@ +""" +This was built before finding out about the crosswalk provided by the +Missouri Census Data Center (MCDC) at the University of Missouri. This crosswalk can be +accessed at (https://mcdc.missouri.edu/applications/geocorr.html) and would be a logical place +to transition to, though since this is already built and new IRS SOI files may be available soon, +it may not be worth the effort to transition. + +To see the definitive "before and after" of congressional redistricting following the 2020 census, +you should compare the block-level data from the 116th Congress to the 119th Congress. + +This approach is necessary for states whose initial redistricting maps were altered due to legal +challenges and is aligned with the mapping files provided by the U.S. Census Bureau. + +- **116th Congress (The "Before"):** This session (2019-2021) used the congressional maps +based on the 2010 census data. It serves as the stable pre-redistricting baseline, as these +maps were identical to those used by the 117th Congress. The Census Bureau's most recent files +for that decade correspond to the 116th Congress. + +- **118th Congress (The "Interim" Stage):** In several states, the initial congressional maps drawn +for the 2022 elections were successfully challenged and invalidated by courts (e.g., for reasons of +partisan or racial gerrymandering). This required the use of temporary, court-ordered, or remedial +maps for the 2022 elections. Consequently, the 118th Congress (2023-2025) in these states represents +an interim stage, not the final outcome of the redistricting cycle. 
+ +- **119th Congress (The Definitive "After"):** Following these legal resolutions, new and more permanent +congressional maps were enacted ahead of the 2024 election cycle. The elections in November 2024 were +the first to use these new maps. Therefore, the 119th Congress (2025-2027) is the first to reflect the +final, settled mapping decisions based on the 2020 census data. + +By comparing the 116th and 119th Congresses, you bypass the anomalous, non-final maps of the 118th Congress, +providing a clear analysis of the redistricting cycle's ultimate impact. +""" + +import requests +import zipfile +import io +from pathlib import Path + +import pandas as pd +import numpy as np +import us + +from policyengine_us_data.storage import STORAGE_FOLDER, CALIBRATION_FOLDER + + +def fetch_block_to_district_map(congress: int) -> pd.DataFrame: + """ + Fetches the Census Block Equivalency File (BEF) for a given Congress. + + This file maps every 2020 census block (GEOID) to its corresponding + congressional district. + + Args: + congress: The congressional session number (e.g., 118 or 119). + + Returns: + A DataFrame with columns ['GEOID', f'CD{congress}']. + """ + if congress == 116: + url = "https://www2.census.gov/programs-surveys/decennial/rdo/mapping-files/2019/116-congressional-district-bef/cd116.zip" + zbytes = requests.get(url, timeout=120).content + + with zipfile.ZipFile(io.BytesIO(zbytes)) as z: + fname = "National_CD116.txt" + bef = pd.read_csv(z.open(fname), dtype=str) + bef.columns = bef.columns.str.strip() + bef = bef.rename(columns={"BLOCKID": "GEOID"}) + return bef[["GEOID", f"CD{congress}"]] + + elif congress == 118: + url = "https://www2.census.gov/programs-surveys/decennial/rdo/mapping-files/2023/118-congressional-district-bef/cd118.zip" + zbytes = requests.get(url, timeout=120).content + + with zipfile.ZipFile(io.BytesIO(zbytes)) as z: + fname = "National_CD118.txt" + bef = pd.read_csv(z.open(fname), dtype=str) + bef.columns = bef.columns.str.strip() + district_col = [c for c in bef.columns if c != "GEOID"][0] + bef = bef.rename(columns={district_col: f"CD{congress}"}) + return bef[["GEOID", f"CD{congress}"]] + + elif congress == 119: + url = "https://www2.census.gov/programs-surveys/decennial/rdo/mapping-files/2025/119-congressional-district-befs/cd119.zip" + zbytes = requests.get(url, timeout=120).content + + with zipfile.ZipFile(io.BytesIO(zbytes)) as z: + fname = "NationalCD119.txt" + bef = pd.read_csv(z.open(fname), sep=",", dtype=str) + bef.columns = bef.columns.str.strip() + bef = bef.rename(columns={"CDFP": f"CD{congress}"}) + return bef[["GEOID", f"CD{congress}"]] + + else: + raise ValueError( + f"Congress {congress} is not supported by this function." + ) + + +def fetch_block_population(state) -> pd.DataFrame: + """ + Download & parse the 2020 PL-94-171 “legacy” files for one state. + + Parameters + ---------- + state : str + Two-letter state/territory postal code **or** full state name + (e.g., "GA", "Georgia", "PR", "Puerto Rico"). + + Returns + ------- + pandas.DataFrame with columns GEOID (15-digit block code) and POP20. 
+ """ + BASE = ( + "https://www2.census.gov/programs-surveys/decennial/2020/data/" + "01-Redistricting_File--PL_94-171/{dir}/{abbr}2020.pl.zip" + ) + st = us.states.lookup(state) + if st is None: + raise ValueError(f"Unrecognised state name/abbr: {state}") + + # Build URL components ----------------------------------------------------- + dir_name = st.name.replace(" ", "_") + abbr = st.abbr.lower() + url = BASE.format(dir=dir_name, abbr=abbr) + + # Download and open the zip ------------------------------------------------ + zbytes = requests.get(url, timeout=120).content + with zipfile.ZipFile(io.BytesIO(zbytes)) as z: + raw = z.read(f"{abbr}geo2020.pl") + try: + geo_lines = raw.decode("utf-8").splitlines() + except UnicodeDecodeError: + geo_lines = raw.decode("latin-1").splitlines() + + p1_lines = z.read(f"{abbr}000012020.pl").decode("utf-8").splitlines() + + # ---------------- GEO file: keep blocks (SUMLEV 750) ---------------------- + geo_records = [ + (parts[7], parts[8][-15:]) # LOGRECNO, 15-digit block GEOID + for ln in geo_lines + if (parts := ln.split("|"))[2] == "750" # summary level 750 = blocks + ] + geo_df = pd.DataFrame(geo_records, columns=["LOGRECNO", "GEOID"]) + + # ---------------- P-file: pull total-population cell ---------------------- + p1_records = [ + (p[4], int(p[5])) for p in map(lambda x: x.split("|"), p1_lines) + ] + p1_df = pd.DataFrame(p1_records, columns=["LOGRECNO", "P0010001"]) + + # ---------------- Merge & finish ----------------------------------------- + return ( + geo_df.merge(p1_df, on="LOGRECNO", how="left") + .assign(POP20=lambda d: d["P0010001"].fillna(0).astype(int)) + .loc[:, ["GEOID", "POP20"]] + .sort_values("GEOID") + .reset_index(drop=True) + ) + + +def build_crosswalk_cd116_to_cd119(): + """Builds the crosswalk between 116th and 119th congress""" + # Pull the census block level population data one state at a time + state_pops = [] + for s in us.states.STATES_AND_TERRITORIES: + if not s.is_territory and s.abbr not in ["DC", "ZZ"]: + print(s.name) + state_pops.append(fetch_block_population(s.abbr)) + block_pop_df = pd.concat(state_pops) + + # Get census blocks for each district under the 116th and 119th congress + # Remove 'ZZ': blocks not assigned to any congressional district + df116 = fetch_block_to_district_map(116) + df116 = df116.loc[df116["CD116"] != "ZZ"] + df119 = fetch_block_to_district_map(119) + df119 = df119.loc[df119["CD119"] != "ZZ"] + + common_blocks = df116.merge(df119, on="GEOID") + + block_stats = block_pop_df.merge(common_blocks, on="GEOID") + block_stats["state_fips"] = block_stats.GEOID.str[:2] + shares = ( + block_stats.groupby(["state_fips", "CD116", "CD119"])["POP20"] + .sum() + .rename("pop_shared") + .reset_index() + ) + + def make_cd_code(state, district): + return f"5001800US{str(state).zfill(2)}{str(district).zfill(2)}" + + shares["code_old"] = shares.apply( + lambda row: make_cd_code(row.state_fips, row.CD116), axis=1 + ) + shares["code_new"] = shares.apply( + lambda row: make_cd_code(row.state_fips, row.CD119), axis=1 + ) + shares["proportion"] = shares.groupby("code_old").pop_shared.transform( + lambda s: s / s.sum() + ) + + ## add DC's district + dc_row = pd.DataFrame( + { + "state_fips": ["11"], # DC's FIPS + "CD116": ["98"], # at-large code in the BEF files + "CD119": ["98"], + "pop_shared": [689545], + "code_old": ["5001800US1198"], + "code_new": ["5001800US1198"], + "proportion": [1.0], + } + ) + + shares = pd.concat([shares, dc_row], ignore_index=True) + + district_mapping = ( + shares[["code_old", 
"code_new", "proportion"]] + .sort_values(["code_old", "proportion"], ascending=[True, False]) + .reset_index(drop=True) + ) + assert len(set(district_mapping.code_old)) == 436 + assert len(set(district_mapping.code_new)) == 436 + mapping_path = Path(STORAGE_FOLDER, "district_mapping.csv") + district_mapping.to_csv(mapping_path, index=False) + + +def get_district_mapping(): + """Puts the 436 by 436 - with DC - (old by new) district mapping matrix into memory""" + + mapping_path = Path(STORAGE_FOLDER, "district_mapping.csv") + mapping_df = pd.read_csv(mapping_path) + + old_codes = sorted(mapping_df.code_old.unique()) + new_codes = sorted(mapping_df.code_new.unique()) + assert len(old_codes) == len(new_codes) == 436 + + old_index = {c: i for i, c in enumerate(old_codes)} + new_index = {c: j for j, c in enumerate(new_codes)} + + mapping_matrix = np.zeros((436, 436), dtype=float) + + for row in mapping_df.itertuples(index=False): + i = old_index[row.code_old] + j = new_index[row.code_new] + mapping_matrix[i, j] = row.proportion + + assert np.allclose(mapping_matrix.sum(axis=1), 1.0) + return {'mapping_matrix': mapping_matrix, 'old_codes': old_codes, 'new_codes': new_codes} + + +if __name__ == "__main__": + build_crosswalk_cd116_to_cd119() + print(get_district_mapping_matrix()) diff --git a/policyengine_us_data/utils/census.py b/policyengine_us_data/utils/census.py index 69a475fb..018cb6a7 100644 --- a/policyengine_us_data/utils/census.py +++ b/policyengine_us_data/utils/census.py @@ -59,6 +59,29 @@ "Wyoming": "56", } +STATE_ABBREV_TO_FIPS = { + 'AL': '01', 'AK': '02', 'AZ': '04', 'AR': '05', 'CA': '06', + 'CO': '08', 'CT': '09', 'DE': '10', 'FL': '12', 'GA': '13', + 'HI': '15', 'ID': '16', 'IL': '17', 'IN': '18', 'IA': '19', + 'KS': '20', 'KY': '21', 'LA': '22', 'ME': '23', 'MD': '24', + 'MA': '25', 'MI': '26', 'MN': '27', 'MS': '28', 'MO': '29', + 'MT': '30', 'NE': '31', 'NV': '32', 'NH': '33', 'NJ': '34', + 'NM': '35', 'NY': '36', 'NC': '37', 'ND': '38', 'OH': '39', + 'OK': '40', 'OR': '41', 'PA': '42', 'RI': '44', 'SC': '45', + 'SD': '46', 'TN': '47', 'TX': '48', 'UT': '49', 'VT': '50', + 'VA': '51', 'WA': '53', 'WV': '54', 'WI': '55', 'WY': '56', + 'DC': '11' +} + +TERRITORY_UCGIDS = { + "0400000US72", # Puerto Rico (state level) + "5001800US7298", # Puerto Rico + "5001800US6098", # American Samoa + "5001800US6698", # Guam + "5001800US6998", # Northern Mariana Islands + "5001800US7898", # U.S. Virgin Islands +} + def get_census_docs(year): docs_url = ( diff --git a/policyengine_us_data/utils/db.py b/policyengine_us_data/utils/db.py new file mode 100644 index 00000000..bb484fab --- /dev/null +++ b/policyengine_us_data/utils/db.py @@ -0,0 +1,61 @@ +from typing import List, Optional + +from sqlmodel import Session, select +import sqlalchemy as sa + +from policyengine_us_data.db.create_database_tables import Stratum, StratumConstraint + + +def get_stratum_by_id(session: Session, stratum_id: int) -> Optional[Stratum]: + """Retrieves a single Stratum by its primary key""" + return session.get(Stratum, stratum_id) + + +def get_simple_stratum_by_ucgid(session: Session, ucgid: str) -> Optional[Stratum]: + """ + Finds a stratum defined *only* by a single ucgid_str constraint. 
+ """ + constraint_count_subquery = ( + select( + StratumConstraint.stratum_id, + sa.func.count(StratumConstraint.stratum_id).label("constraint_count") + ) + .group_by(StratumConstraint.stratum_id) + .subquery() + ) + + statement = ( + select(Stratum) + .join(StratumConstraint) + .join( + constraint_count_subquery, + Stratum.stratum_id == constraint_count_subquery.c.stratum_id + ) + .where(StratumConstraint.constraint_variable == "ucgid_str") + .where(StratumConstraint.value == ucgid) + .where(constraint_count_subquery.c.constraint_count == 1) + ) + + return session.exec(statement).first() + + +def get_root_strata(session: Session) -> List[Stratum]: + """Finds all strata that do not have a parent""" + statement = select(Stratum).where(Stratum.parent_stratum_id == None) + return session.exec(statement).all() + + +def get_stratum_children(session: Session, stratum_id: int) -> List[Stratum]: + """Retrieves all direct children for a given stratum""" + parent_stratum = get_stratum_by_id(session, stratum_id) + if parent_stratum: + return parent_stratum.children_rel + return [] + + +def get_stratum_parent(session: Session, stratum_id: int) -> Optional[Stratum]: + """Retrieves the direct parent for a given stratum.""" + child_stratum = get_stratum_by_id(session, stratum_id) + if child_stratum: + return child_stratum.parent_rel + return None From dddf6891c61ee4fb4ccb84ef0e9f1d49636ebb3f Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Thu, 14 Aug 2025 18:57:31 -0400 Subject: [PATCH 13/27] linting --- .../db/create_initial_strata.py | 41 +- policyengine_us_data/db/etl_age.py | 10 +- policyengine_us_data/db/etl_irs_soi.py | 362 ++++++++++-------- policyengine_us_data/db/etl_medicaid.py | 61 ++- policyengine_us_data/db/etl_snap.py | 22 +- policyengine_us_data/db/temp.py | 50 +-- .../make_district_mapping.py | 6 +- policyengine_us_data/utils/census.py | 68 +++- policyengine_us_data/utils/db.py | 15 +- 9 files changed, 381 insertions(+), 254 deletions(-) diff --git a/policyengine_us_data/db/create_initial_strata.py b/policyengine_us_data/db/create_initial_strata.py index 0a7e7f7a..a2a333df 100644 --- a/policyengine_us_data/db/create_initial_strata.py +++ b/policyengine_us_data/db/create_initial_strata.py @@ -6,31 +6,33 @@ from sqlmodel import SQLModel, Session, select -from policyengine_us.variables.household.demographic.geographic.ucgid.ucgid_enum import UCGID +from policyengine_us.variables.household.demographic.geographic.ucgid.ucgid_enum import ( + UCGID, +) from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, ) - def main(): - # Get the implied hierarchy by the UCGID enum -------- + # Get the implied hierarchy by the UCGID enum -------- rows = [] for node in UCGID: codes = node.get_hierarchical_codes() - rows.append({ - "name": node.name, - "code": codes[0], - "parent": codes[1] if len(codes) > 1 else None - }) - + rows.append( + { + "name": node.name, + "code": codes[0], + "parent": codes[1] if len(codes) > 1 else None, + } + ) + hierarchy_df = ( pd.DataFrame(rows) - .sort_values(["parent", "code"], na_position="first") - .reset_index(drop=True) + .sort_values(["parent", "code"], na_position="first") + .reset_index(drop=True) ) - DATABASE_URL = "sqlite:///policy_data.db" engine = create_engine(DATABASE_URL) @@ -40,11 +42,13 @@ def main(): # map the ucgid_str 'code' to auto-generated 'stratum_id' code_to_stratum_id: Dict[str, int] = {} - + for _, row in hierarchy_df.iterrows(): parent_code = row["parent"] - - parent_id = 
code_to_stratum_id.get(parent_code) if parent_code else None + + parent_id = ( + code_to_stratum_id.get(parent_code) if parent_code else None + ) new_stratum = Stratum( parent_stratum_id=parent_id, @@ -59,14 +63,15 @@ def main(): value=row["code"], ) ] - + session.add(new_stratum) - + session.flush() - + code_to_stratum_id[row["code"]] = new_stratum.stratum_id session.commit() + if __name__ == "__main__": main() diff --git a/policyengine_us_data/db/etl_age.py b/policyengine_us_data/db/etl_age.py index 084a43d6..7bb36ed4 100644 --- a/policyengine_us_data/db/etl_age.py +++ b/policyengine_us_data/db/etl_age.py @@ -78,11 +78,11 @@ def transform_age_data(age_data, docs): ) age_bounds = df_long["age_range"].str.split("-", expand=True).astype(int) age_bounds.columns = ["ge", "le"] - age_bounds[['gt']] = age_bounds[["ge"]] - 1 - age_bounds[['lt']] = age_bounds[["le"]] + 1 + age_bounds[["gt"]] = age_bounds[["ge"]] - 1 + age_bounds[["lt"]] = age_bounds[["le"]] + 1 df_long["age_greater_than"] = age_bounds[["gt"]] - df_long["age_less_than"] = age_bounds[["lt"]] + df_long["age_less_than"] = age_bounds[["lt"]] df_long["variable"] = "person_count" df_long["reform_id"] = 0 df_long["source_id"] = 1 @@ -208,6 +208,4 @@ def load_age_data(df_long, geo, year, stratum_lookup={}): state_strata_lku = load_age_data( long_state_df, "State", year, national_strata_lku ) - load_age_data( - long_district_df, "District", year, state_strata_lku - ) + load_age_data(long_district_df, "District", year, state_strata_lku) diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py index c93eb593..a4a07cfe 100644 --- a/policyengine_us_data/db/etl_irs_soi.py +++ b/policyengine_us_data/db/etl_irs_soi.py @@ -10,9 +10,17 @@ StratumConstraint, Target, ) -from policyengine_us_data.utils.db import get_stratum_by_id, get_simple_stratum_by_ucgid, get_root_strata, get_stratum_children, get_stratum_parent +from policyengine_us_data.utils.db import ( + get_stratum_by_id, + get_simple_stratum_by_ucgid, + get_root_strata, + get_stratum_children, + get_stratum_parent, +) from policyengine_us_data.utils.census import TERRITORY_UCGIDS -from policyengine_us_data.storage.calibration_targets.make_district_mapping import get_district_mapping +from policyengine_us_data.storage.calibration_targets.make_district_mapping import ( + get_district_mapping, +) """See the 22incddocguide.docx manual from the IRS SOI""" @@ -22,7 +30,7 @@ AGI_STUB_TO_INCOME_RANGE = { 1: (-np.inf, 1), 2: (1 - epsilon, 10_000), - 3: (10_000 - epsilon , 25_000), + 3: (10_000 - epsilon, 25_000), 4: (25_000 - epsilon, 50_000), 5: (50_000 - epsilon, 75_000), 6: (75_000 - epsilon, 100_000), @@ -31,12 +39,13 @@ 9: (500_000 - epsilon, np.inf), } + def create_records(df, breakdown_variable, target_variable): """Transforms a DataFrame subset into a standardized list of records.""" temp_df = df[["ucgid_str"]].copy() - temp_df["breakdown_variable"] = breakdown_variable + temp_df["breakdown_variable"] = breakdown_variable temp_df["breakdown_value"] = df[breakdown_variable] - temp_df["target_variable"] = target_variable + temp_df["target_variable"] = target_variable temp_df["target_value"] = df[target_variable] return temp_df @@ -50,18 +59,15 @@ def make_records( breakdown_col: Optional[str] = None, multiplier: int = 1_000, ): - df = ( - df.rename({count_col: "tax_unit_count", - amount_col: amount_name}, - axis=1) - .copy() - ) + df = df.rename( + {count_col: "tax_unit_count", amount_col: amount_name}, axis=1 + ).copy() if breakdown_col is None: 
breakdown_col = "one" df[breakdown_col] = 1 - rec_counts = create_records(df, breakdown_col, "tax_unit_count") + rec_counts = create_records(df, breakdown_col, "tax_unit_count") rec_amounts = create_records(df, breakdown_col, amount_name) rec_amounts["target_value"] *= multiplier # Only the amounts get * 1000 rec_counts["target_variable"] = f"{amount_name}_tax_unit_count" @@ -72,57 +78,64 @@ def make_records( def make_agi_long(df: pd.DataFrame) -> pd.DataFrame: """Convert IRS SOI AGI‑split table from wide to the long format used""" target_col_map = { - "N1": "agi_tax_unit_count", - "N2": "agi_person_count", + "N1": "agi_tax_unit_count", + "N2": "agi_person_count", "A00100": "agi_total_amount", } - work = ( - df[["ucgid_str", "agi_stub"] + list(target_col_map)] - .rename(columns=target_col_map) + work = df[["ucgid_str", "agi_stub"] + list(target_col_map)].rename( + columns=target_col_map ) long = ( work.melt( id_vars=["ucgid_str", "agi_stub"], var_name="target_variable", - value_name="target_value" + value_name="target_value", ) .rename(columns={"agi_stub": "breakdown_value"}) .assign(breakdown_variable="agi_stub") ) - long = long[["ucgid_str", - "breakdown_variable", - "breakdown_value", - "target_variable", - "target_value"]] + long = long[ + [ + "ucgid_str", + "breakdown_variable", + "breakdown_value", + "target_variable", + "target_value", + ] + ] return [ - df.sort_values(by='ucgid_str').reset_index(drop=True) - for name, df in long.groupby(['breakdown_value', 'target_variable']) + df.sort_values(by="ucgid_str").reset_index(drop=True) + for name, df in long.groupby(["breakdown_value", "target_variable"]) ] def convert_district_data( input_df: pd.DataFrame, mapping_matrix: np.ndarray, # 436 x 436A - new_district_codes + new_district_codes, ) -> pd.DataFrame: """Transforms data from pre- to post- 2020 census districts""" df = input_df.copy() - old_districts_df = df[df['ucgid_str'].str.startswith("5001800US")].copy() - old_districts_df = old_districts_df.sort_values('ucgid_str').reset_index(drop=True) - old_values = old_districts_df['target_value'].to_numpy() + old_districts_df = df[df["ucgid_str"].str.startswith("5001800US")].copy() + old_districts_df = old_districts_df.sort_values("ucgid_str").reset_index( + drop=True + ) + old_values = old_districts_df["target_value"].to_numpy() new_values = mapping_matrix.T @ old_values # Create a new DataFrame for the transformed data, preserving the original schema. 
- new_districts_df = pd.DataFrame({ - 'ucgid_str': new_district_codes, - 'breakdown_variable': old_districts_df['breakdown_variable'], - 'breakdown_value': old_districts_df['breakdown_value'], - 'target_variable': old_districts_df['target_variable'], - 'target_value': new_values - }) + new_districts_df = pd.DataFrame( + { + "ucgid_str": new_district_codes, + "breakdown_variable": old_districts_df["breakdown_variable"], + "breakdown_value": old_districts_df["breakdown_value"], + "target_variable": old_districts_df["target_variable"], + "target_value": new_values, + } + ) - other_geos_df = df[~df['ucgid_str'].str.startswith("5001800US")].copy() + other_geos_df = df[~df["ucgid_str"].str.startswith("5001800US")].copy() final_df = pd.concat([other_geos_df, new_districts_df], ignore_index=True) @@ -153,88 +166,96 @@ def transform_soi_data(raw_df): dict(code="00400", name="tax_exempt_interest", breakdown=None), dict(code="00600", name="oridinary_dividends", breakdown=None), dict(code="00650", name="qualified_dividends", breakdown=None), - dict(code="26270", name="partnership_and_s_crop_net_income", breakdown=None), + dict( + code="26270", + name="partnership_and_s_crop_net_income", + breakdown=None, + ), dict(code="02500", name="total_social_security", breakdown=None), dict(code="01700", name="pension_and_annuities", breakdown=None), dict(code="02300", name="unemployment_compensation", breakdown=None), dict(code="00900", name="business_net_income", breakdown=None), - dict(code="17000", name="medical_and_dental_deduction", breakdown=None), + dict( + code="17000", name="medical_and_dental_deduction", breakdown=None + ), dict(code="00700", name="salt_refunds", breakdown=None), dict(code="18425", name="salt_amount", breakdown=None), dict(code="06500", name="income_tax", breakdown=None), ] # National --------------- - national_df = raw_df.copy().loc[ - (raw_df.STATE == "US") - ] + national_df = raw_df.copy().loc[(raw_df.STATE == "US")] national_df["ucgid_str"] = "0100000US" # State ------------------- # You've got agi_stub == 0 in here, which you want to use any time you don't want to # break things up by AGI state_df = raw_df.copy().loc[ - (raw_df.STATE != "US") & - (raw_df.CONG_DISTRICT == 0) + (raw_df.STATE != "US") & (raw_df.CONG_DISTRICT == 0) ] - state_df["ucgid_str"] = "0400000US" + state_df["STATEFIPS"].astype(str).str.zfill(2) + state_df["ucgid_str"] = "0400000US" + state_df["STATEFIPS"].astype( + str + ).str.zfill(2) # District ------------------ # This is going to fail because we're missing the single cong district states - district_df = raw_df.copy().loc[ - (raw_df.CONG_DISTRICT > 0) - ] + district_df = raw_df.copy().loc[(raw_df.CONG_DISTRICT > 0)] - max_cong_district_by_state = raw_df.groupby('STATE')['CONG_DISTRICT'].transform('max') + max_cong_district_by_state = raw_df.groupby("STATE")[ + "CONG_DISTRICT" + ].transform("max") district_df = raw_df.copy().loc[ - (raw_df['CONG_DISTRICT'] > 0) | (max_cong_district_by_state == 0) + (raw_df["CONG_DISTRICT"] > 0) | (max_cong_district_by_state == 0) ] - district_df = district_df.loc[district_df['STATE'] != 'US'] - district_df["STATEFIPS"] = district_df["STATEFIPS"].astype(int).astype(str).str.zfill(2) + district_df = district_df.loc[district_df["STATE"] != "US"] + district_df["STATEFIPS"] = ( + district_df["STATEFIPS"].astype(int).astype(str).str.zfill(2) + ) district_df["CONG_DISTRICT"] = ( district_df["CONG_DISTRICT"].astype(int).astype(str).str.zfill(2) ) - district_df["ucgid_str"] = "5001800US" + district_df["STATEFIPS"] + 
district_df["CONG_DISTRICT"] + district_df["ucgid_str"] = ( + "5001800US" + district_df["STATEFIPS"] + district_df["CONG_DISTRICT"] + ) district_df = district_df[~district_df["ucgid_str"].isin(TERRITORY_UCGIDS)] assert district_df.shape[0] % 436 == 0 all_df = pd.concat([national_df, state_df, district_df]) - # "Marginal" over AGI bands, which this data set is organized according to + # "Marginal" over AGI bands, which this data set is organized according to all_marginals = all_df.copy().loc[all_df.agi_stub == 0] assert all_marginals.shape[0] == 436 + 51 + 1 # Collect targets from the SOI file records = [] for spec in TARGETS: - count_col = f"N{spec['code']}" # e.g. 'N59661' + count_col = f"N{spec['code']}" # e.g. 'N59661' amount_col = f"A{spec['code']}" # e.g. 'A59661' - + df = all_marginals.copy() - + if spec["breakdown"] is not None: col, val = spec["breakdown"] df[col] = val breakdown_col = col else: breakdown_col = None - + rec_counts, rec_amounts = make_records( df, - count_col = count_col, - amount_col = amount_col, - amount_name = spec["name"], - breakdown_col = breakdown_col, - multiplier = 1_000, + count_col=count_col, + amount_col=amount_col, + amount_name=spec["name"], + breakdown_col=breakdown_col, + multiplier=1_000, ) records.extend([rec_counts, rec_amounts]) - # AGI Processing (separate, doesn't have a count column) temp_df = df[["ucgid_str"]].copy() - temp_df["breakdown_variable"] = "one" - temp_df["breakdown_value"] = 1 + temp_df["breakdown_variable"] = "one" + temp_df["breakdown_value"] = 1 temp_df["target_variable"] = "agi" temp_df["target_value"] = df["A00100"] * 1_000 @@ -250,9 +271,14 @@ def transform_soi_data(raw_df): # Pre- to Post- 2020 Census redisticting mapping = get_district_mapping() - converted = [convert_district_data(r, mapping['mapping_matrix'], mapping['new_codes']) for r in records] + converted = [ + convert_district_data( + r, mapping["mapping_matrix"], mapping["new_codes"] + ) + for r in records + ] - return converted + return converted def load_soi_data(long_dfs, year): @@ -262,70 +288,74 @@ def load_soi_data(long_dfs, year): session = Session(engine) - # Load EITC data -------------------------------------------------------- + # Load EITC data -------------------------------------------------------- # Obviously this is not especially robust --- - eitc_data = {'0': (long_dfs[0], long_dfs[1]), - '1': (long_dfs[2], long_dfs[3]), - '2': (long_dfs[4], long_dfs[5]), - '3+': (long_dfs[6], long_dfs[7])} + eitc_data = { + "0": (long_dfs[0], long_dfs[1]), + "1": (long_dfs[2], long_dfs[3]), + "2": (long_dfs[4], long_dfs[5]), + "3+": (long_dfs[6], long_dfs[7]), + } stratum_lookup = {"State": {}, "District": {}} for n_children in eitc_data.keys(): eitc_count_i, eitc_amount_i = eitc_data[n_children] for i in range(eitc_count_i.shape[0]): - ucgid_i = eitc_count_i[['ucgid_str']].iloc[i].values[0] + ucgid_i = eitc_count_i[["ucgid_str"]].iloc[i].values[0] note = f"Geo: {ucgid_i}, EITC received with {n_children} children" if len(ucgid_i) == 9: # National. 
new_stratum = Stratum( parent_stratum_id=None, stratum_group_id=0, notes=note ) - elif len(ucgid_i) == 11: # State + elif len(ucgid_i) == 11: # State new_stratum = Stratum( parent_stratum_id=stratum_lookup["National"], stratum_group_id=0, - notes=note + notes=note, ) - elif len(ucgid_i) == 13: # District + elif len(ucgid_i) == 13: # District new_stratum = Stratum( - parent_stratum_id=stratum_lookup["State"]['0400000US' + ucgid_i[9:11]], + parent_stratum_id=stratum_lookup["State"][ + "0400000US" + ucgid_i[9:11] + ], stratum_group_id=0, - notes=note + notes=note, ) new_stratum.constraints_rel = [ - StratumConstraint( - constraint_variable="ucgid_str", - operation="in", - value=ucgid_i, - ), + StratumConstraint( + constraint_variable="ucgid_str", + operation="in", + value=ucgid_i, + ), ] if n_children == "3+": - new_stratum.constraints_rel.append( - StratumConstraint( - constraint_variable="eitc_children", - operation="greater_than", - value='2', - ) - ) + new_stratum.constraints_rel.append( + StratumConstraint( + constraint_variable="eitc_children", + operation="greater_than", + value="2", + ) + ) else: - new_stratum.constraints_rel.append( - StratumConstraint( - constraint_variable="eitc_children", - operation="equals", - value=f'{n_children}', - ) - ) + new_stratum.constraints_rel.append( + StratumConstraint( + constraint_variable="eitc_children", + operation="equals", + value=f"{n_children}", + ) + ) new_stratum.targets_rel = [ # It's already complex enough - #Target( + # Target( # variable="tax_unit_count", # period=year, # value=eitc_count_i.iloc[i][["target_value"]].values[0], # source_id=5, # active=True, - #), + # ), Target( variable="eitc", period=year, @@ -339,19 +369,21 @@ def load_soi_data(long_dfs, year): session.flush() if len(ucgid_i) == 9: - stratum_lookup["National"] = new_stratum.stratum_id - elif len(ucgid_i) == 11: - stratum_lookup["State"][ucgid_i] = new_stratum.stratum_id + stratum_lookup["National"] = new_stratum.stratum_id + elif len(ucgid_i) == 11: + stratum_lookup["State"][ucgid_i] = new_stratum.stratum_id session.commit() - # No breakdown variables in this set + # No breakdown variables in this set for j in range(8, 42, 2): - count_j, amount_j = long_dfs[j], long_dfs[j + 1] + count_j, amount_j = long_dfs[j], long_dfs[j + 1] amount_variable_name = amount_j.iloc[0][["target_variable"]].values[0] - print(f"Loading amount data for IRS SOI data on {amount_variable_name}") + print( + f"Loading amount data for IRS SOI data on {amount_variable_name}" + ) for i in range(count_j.shape[0]): - ucgid_i = count_j[['ucgid_str']].iloc[i].values[0] + ucgid_i = count_j[["ucgid_str"]].iloc[i].values[0] # Reusing an existing stratum this time, since there is no breakdown stratum = get_simple_stratum_by_ucgid(session, ucgid_i) @@ -361,13 +393,13 @@ def load_soi_data(long_dfs, year): # NOTE: If I do the counts, I'm going to need to explode the strata for the vars != 0 # OR, create new variables like qbid_tax_unit_count which requires adding stuff to -us # AND, it's already complex enough ----- - #Target( + # Target( # variable="tax_unit_count", # period=year, # value=count_j.iloc[i][["target_value"]].values[0], # source_id=5, # active=True, - #), + # ), Target( variable=amount_variable_name, period=year, @@ -382,11 +414,11 @@ def load_soi_data(long_dfs, year): session.commit() - # Adjusted Gross Income ------ + # Adjusted Gross Income ------ agi_values = long_dfs[42] for i in range(agi_values.shape[0]): - ucgid_i = agi_values[['ucgid_str']].iloc[i].values[0] + ucgid_i = 
agi_values[["ucgid_str"]].iloc[i].values[0] stratum = get_simple_stratum_by_ucgid(session, ucgid_i) stratum.targets_rel.append( Target( @@ -399,10 +431,14 @@ def load_soi_data(long_dfs, year): ) session.add(stratum) session.flush() - + session.commit() - agi_person_count_dfs = [df for df in long_dfs[43:] if df['target_variable'].iloc[0] == 'agi_person_count'] + agi_person_count_dfs = [ + df + for df in long_dfs[43:] + if df["target_variable"].iloc[0] == "agi_person_count" + ] for agi_df in agi_person_count_dfs: agi_stub = agi_df.iloc[0][["breakdown_value"]].values[0] @@ -413,62 +449,72 @@ def load_soi_data(long_dfs, year): nat_stratum = Stratum( parent_stratum_id=None, stratum_group_id=0, notes=note ) - nat_stratum.constraints_rel.extend([ - StratumConstraint( - constraint_variable="ucgid_str", - operation="in", - value=ucgid_i, - ), - StratumConstraint( - constraint_variable="agi", - operation="greater_than", - value=str(agi_income_lower), - ), - StratumConstraint( - constraint_variable="agi", - operation="less_than", - value=str(agi_income_upper), - ), - ]) + nat_stratum.constraints_rel.extend( + [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="in", + value=ucgid_i, + ), + StratumConstraint( + constraint_variable="agi", + operation="greater_than", + value=str(agi_income_lower), + ), + StratumConstraint( + constraint_variable="agi", + operation="less_than", + value=str(agi_income_upper), + ), + ] + ) session.add(nat_stratum) session.flush() - - stratum_lookup = {"National": nat_stratum.stratum_id, "State": {}, "District": {}} + + stratum_lookup = { + "National": nat_stratum.stratum_id, + "State": {}, + "District": {}, + } for i in range(agi_df.shape[0]): - ucgid_i = agi_df[['ucgid_str']].iloc[i].values[0] + ucgid_i = agi_df[["ucgid_str"]].iloc[i].values[0] note = f"Geo: {ucgid_i}, AGI > {agi_income_lower}, AGI < {agi_income_upper}" person_count = agi_df.iloc[i][["target_value"]].values[0] - if len(ucgid_i) == 11: # State + if len(ucgid_i) == 11: # State new_stratum = Stratum( parent_stratum_id=stratum_lookup["National"], stratum_group_id=0, - notes=note + notes=note, ) - elif len(ucgid_i) == 13: # District + elif len(ucgid_i) == 13: # District new_stratum = Stratum( - parent_stratum_id=stratum_lookup["State"]['0400000US' + ucgid_i[9:11]], + parent_stratum_id=stratum_lookup["State"][ + "0400000US" + ucgid_i[9:11] + ], stratum_group_id=0, - notes=note + notes=note, ) - new_stratum.constraints_rel.extend([ - StratumConstraint( - constraint_variable="ucgid_str", - operation="in", - value=ucgid_i, - ), - StratumConstraint( - constraint_variable="agi", - operation="greater_than", - value=str(agi_income_lower), - ), - StratumConstraint( - constraint_variable="agi", - operation="less_than", - value=str(agi_income_upper), - ), - ]) + new_stratum.constraints_rel.extend( + [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="in", + value=ucgid_i, + ), + StratumConstraint( + constraint_variable="agi", + operation="greater_than", + value=str(agi_income_lower), + ), + StratumConstraint( + constraint_variable="agi", + operation="less_than", + value=str(agi_income_upper), + ), + ] + ) new_stratum.targets_rel = [ Target( variable="person_count", @@ -483,9 +529,9 @@ def load_soi_data(long_dfs, year): session.flush() if len(ucgid_i) == 9: - stratum_lookup["National"] = new_stratum.stratum_id - elif len(ucgid_i) == 11: - stratum_lookup["State"][ucgid_i] = new_stratum.stratum_id + stratum_lookup["National"] = new_stratum.stratum_id + elif len(ucgid_i) == 11: + 
stratum_lookup["State"][ucgid_i] = new_stratum.stratum_id session.commit() diff --git a/policyengine_us_data/db/etl_medicaid.py b/policyengine_us_data/db/etl_medicaid.py index ec16ac71..3a5ab7d7 100644 --- a/policyengine_us_data/db/etl_medicaid.py +++ b/policyengine_us_data/db/etl_medicaid.py @@ -28,13 +28,13 @@ def extract_medicaid_data(year): item = "6165f45b-ca93-5bb5-9d06-db29c692a360" response = requests.get( - f"https://data.medicaid.gov/api/1/metastore/schemas/dataset/items/{item}?show-reference-ids=false" + f"https://data.medicaid.gov/api/1/metastore/schemas/dataset/items/{item}?show-reference-ids=false" ) metadata = response.json() - - data_url = metadata['distribution'][0]['data']['downloadURL'] + + data_url = metadata["distribution"][0]["data"]["downloadURL"] state_admin_df = pd.read_csv(data_url) - + return cd_survey_df, state_admin_df @@ -43,27 +43,42 @@ def transform_medicaid_data(state_admin_df, cd_survey_df, year): reporting_period = year * 100 + 12 print(f"Reporting period is {reporting_period}") state_df = state_admin_df.loc[ - (state_admin_df["Reporting Period"] == reporting_period) & - (state_admin_df["Final Report"] == "Y"), - ["State Abbreviation", "Reporting Period", "Total Medicaid Enrollment"] + (state_admin_df["Reporting Period"] == reporting_period) + & (state_admin_df["Final Report"] == "Y"), + [ + "State Abbreviation", + "Reporting Period", + "Total Medicaid Enrollment", + ], ] state_df["FIPS"] = state_df["State Abbreviation"].map(STATE_ABBREV_TO_FIPS) - cd_df = cd_survey_df[["GEO_ID", "state", "congressional district", "S2704_C02_006E"]] + cd_df = cd_survey_df[ + ["GEO_ID", "state", "congressional district", "S2704_C02_006E"] + ] nc_cd_sum = cd_df.loc[cd_df.state == "37"].S2704_C02_006E.astype(int).sum() - nc_state_sum = state_df.loc[state_df.FIPS == '37']['Total Medicaid Enrollment'].values[0] - assert nc_cd_sum > .5 * nc_state_sum + nc_state_sum = state_df.loc[state_df.FIPS == "37"][ + "Total Medicaid Enrollment" + ].values[0] + assert nc_cd_sum > 0.5 * nc_state_sum assert nc_cd_sum <= nc_state_sum - state_df = state_df.rename(columns={'Total Medicaid Enrollment': 'medicaid_enrollment'}) - state_df['ucgid_str'] = '0400000US' + state_df['FIPS'].astype(str) + state_df = state_df.rename( + columns={"Total Medicaid Enrollment": "medicaid_enrollment"} + ) + state_df["ucgid_str"] = "0400000US" + state_df["FIPS"].astype(str) - cd_df = cd_df.rename(columns={'S2704_C02_006E': 'medicaid_enrollment', 'GEO_ID': 'ucgid_str'}) - cd_df = cd_df.loc[cd_df.state != '72'] + cd_df = cd_df.rename( + columns={ + "S2704_C02_006E": "medicaid_enrollment", + "GEO_ID": "ucgid_str", + } + ) + cd_df = cd_df.loc[cd_df.state != "72"] - out_cols = ['ucgid_str', 'medicaid_enrollment'] + out_cols = ["ucgid_str", "medicaid_enrollment"] return state_df[out_cols], cd_df[out_cols] @@ -80,7 +95,9 @@ def load_medicaid_data(long_state, long_cd, year): # National ---------------- nat_stratum = Stratum( - parent_stratum_id=None, stratum_group_id=0, notes="Geo: 0100000US Medicaid Enrolled" + parent_stratum_id=None, + stratum_group_id=0, + notes="Geo: 0100000US Medicaid Enrolled", ) nat_stratum.constraints_rel = [ StratumConstraint( @@ -101,7 +118,7 @@ def load_medicaid_data(long_state, long_cd, year): stratum_lookup["National"] = nat_stratum.stratum_id # State ------------------- - stratum_lookup["State"] = {} + stratum_lookup["State"] = {} for _, row in long_state.iterrows(): note = f"Geo: {row['ucgid_str']} Medicaid Enrolled" @@ -133,13 +150,15 @@ def load_medicaid_data(long_state, long_cd, year): ) 
session.add(new_stratum) session.flush() - stratum_lookup["State"][row['ucgid_str']] = new_stratum.stratum_id + stratum_lookup["State"][row["ucgid_str"]] = new_stratum.stratum_id # District ------------------- for _, row in long_cd.iterrows(): note = f"Geo: {row['ucgid_str']} Medicaid Enrolled" - parent_stratum_id = stratum_lookup["State"][f'0400000US{row["ucgid_str"][-4:-2]}'] + parent_stratum_id = stratum_lookup["State"][ + f'0400000US{row["ucgid_str"][-4:-2]}' + ] new_stratum = Stratum( parent_stratum_id=parent_stratum_id, stratum_group_id=0, notes=note @@ -179,7 +198,9 @@ def load_medicaid_data(long_state, long_cd, year): cd_survey_df, state_admin_df = extract_medicaid_data(year) # Transform ------------------- - long_state, long_cd = transform_medicaid_data(state_admin_df, cd_survey_df, year) + long_state, long_cd = transform_medicaid_data( + state_admin_df, cd_survey_df, year + ) # Load ----------------------- load_medicaid_data(long_state, long_cd, year) diff --git a/policyengine_us_data/db/etl_snap.py b/policyengine_us_data/db/etl_snap.py index f9a172a9..fb110025 100644 --- a/policyengine_us_data/db/etl_snap.py +++ b/policyengine_us_data/db/etl_snap.py @@ -134,10 +134,8 @@ def transform_administrative_snap_data(zip_file, year): def transform_survey_snap_data(raw_df): df = raw_df.copy() - return df[["GEO_ID", "S2201_C03_001E"]].rename({ - "GEO_ID": "ucgid_str", - "S2201_C03_001E": "snap_household_ct" - }, axis=1 + return df[["GEO_ID", "S2201_C03_001E"]].rename( + {"GEO_ID": "ucgid_str", "S2201_C03_001E": "snap_household_ct"}, axis=1 )[ ~df["GEO_ID"].isin( [ # Puerto Rico's state and district @@ -160,7 +158,9 @@ def load_administrative_snap_data(df_states, year): # National ---------------- nat_stratum = Stratum( - parent_stratum_id=None, stratum_group_id=0, notes="Geo: 0100000US Received SNAP Benefits" + parent_stratum_id=None, + stratum_group_id=0, + notes="Geo: 0100000US Received SNAP Benefits", ) nat_stratum.constraints_rel = [ StratumConstraint( @@ -182,7 +182,7 @@ def load_administrative_snap_data(df_states, year): stratum_lookup["National"] = nat_stratum.stratum_id # State ------------------- - stratum_lookup["State"] = {} + stratum_lookup["State"] = {} for _, row in df_states.iterrows(): note = f"Geo: {row['ucgid_str']} Received SNAP Benefits" @@ -224,13 +224,13 @@ def load_administrative_snap_data(df_states, year): ) session.add(new_stratum) session.flush() - stratum_lookup["State"][row['ucgid_str']] = new_stratum.stratum_id + stratum_lookup["State"][row["ucgid_str"]] = new_stratum.stratum_id session.commit() return stratum_lookup -def load_survey_snap_data(survey_df, year, stratum_lookup ={}): +def load_survey_snap_data(survey_df, year, stratum_lookup={}): """Use an already defined stratum_lookup to load the survey SNAP data""" DATABASE_URL = "sqlite:///policy_data.db" @@ -243,8 +243,8 @@ def load_survey_snap_data(survey_df, year, stratum_lookup ={}): district_df = survey_df.copy() for _, row in district_df.iterrows(): note = f"Geo: {row['ucgid_str']} Received SNAP Benefits" - state_ucgid_str = '0400000US' + row['ucgid_str'][9:11] - state_stratum_id = stratum_lookup['State'][state_ucgid_str] + state_ucgid_str = "0400000US" + row["ucgid_str"][9:11] + state_stratum_id = stratum_lookup["State"][state_ucgid_str] new_stratum = Stratum( parent_stratum_id=state_stratum_id, stratum_group_id=0, notes=note ) @@ -258,7 +258,7 @@ def load_survey_snap_data(survey_df, year, stratum_lookup ={}): StratumConstraint( constraint_variable="snap", operation="greater_than", - value='0', + 
value="0", ), ] new_stratum.targets_rel.append( diff --git a/policyengine_us_data/db/temp.py b/policyengine_us_data/db/temp.py index 6026ace0..0c687ff3 100644 --- a/policyengine_us_data/db/temp.py +++ b/policyengine_us_data/db/temp.py @@ -1,6 +1,8 @@ # ucgid_str converts the household’s ucgid enumeration into a comma‑separated string of all hierarchical UCGID codes. from policyengine_us import Simulation -from policyengine_us.variables.household.demographic.geographic.ucgid.ucgid_enum import UCGID +from policyengine_us.variables.household.demographic.geographic.ucgid.ucgid_enum import ( + UCGID, +) # Minimal one-household simulation sim = Simulation( @@ -22,36 +24,40 @@ # First, let's explore UCGID, the enum, and how it can create the hierarchy import pandas as pd -from policyengine_us.variables.household.demographic.geographic.ucgid.ucgid_enum import UCGID +from policyengine_us.variables.household.demographic.geographic.ucgid.ucgid_enum import ( + UCGID, +) rows = [] for node in UCGID: codes = node.get_hierarchical_codes() - rows.append({ - "name": node.name, - "code": codes[0], - "parent": codes[1] if len(codes) > 1 else None - }) + rows.append( + { + "name": node.name, + "code": codes[0], + "parent": codes[1] if len(codes) > 1 else None, + } + ) hierarchy_df = ( pd.DataFrame(rows) - .sort_values(["parent", "code"], na_position="first") - .reset_index(drop=True) + .sort_values(["parent", "code"], na_position="first") + .reset_index(drop=True) ) print(hierarchy_df) -#Out[262]: +# Out[262]: # name code parent -#0 US 0100000US None -#1 AL 0400000US01 0100000US -#2 AK 0400000US02 0100000US -#3 AZ 0400000US04 0100000US -#4 AR 0400000US05 0100000US -#.. ... ... ... -#483 WI_05 5001800US5505 0400000US55 -#484 WI_06 5001800US5506 0400000US55 -#485 WI_07 5001800US5507 0400000US55 -#486 WI_08 5001800US5508 0400000US55 -#487 WY_01 5001800US5600 0400000US56 +# 0 US 0100000US None +# 1 AL 0400000US01 0100000US +# 2 AK 0400000US02 0100000US +# 3 AZ 0400000US04 0100000US +# 4 AR 0400000US05 0100000US +# .. ... ... ... 
+# 483 WI_05 5001800US5505 0400000US55 +# 484 WI_06 5001800US5506 0400000US55 +# 485 WI_07 5001800US5507 0400000US55 +# 486 WI_08 5001800US5508 0400000US55 +# 487 WY_01 5001800US5600 0400000US56 # -#[488 rows x 3 columns] +# [488 rows x 3 columns] diff --git a/policyengine_us_data/storage/calibration_targets/make_district_mapping.py b/policyengine_us_data/storage/calibration_targets/make_district_mapping.py index cc3f50fb..72ff8d88 100644 --- a/policyengine_us_data/storage/calibration_targets/make_district_mapping.py +++ b/policyengine_us_data/storage/calibration_targets/make_district_mapping.py @@ -246,7 +246,11 @@ def get_district_mapping(): mapping_matrix[i, j] = row.proportion assert np.allclose(mapping_matrix.sum(axis=1), 1.0) - return {'mapping_matrix': mapping_matrix, 'old_codes': old_codes, 'new_codes': new_codes} + return { + "mapping_matrix": mapping_matrix, + "old_codes": old_codes, + "new_codes": new_codes, + } if __name__ == "__main__": diff --git a/policyengine_us_data/utils/census.py b/policyengine_us_data/utils/census.py index 018cb6a7..fb577e60 100644 --- a/policyengine_us_data/utils/census.py +++ b/policyengine_us_data/utils/census.py @@ -60,17 +60,57 @@ } STATE_ABBREV_TO_FIPS = { - 'AL': '01', 'AK': '02', 'AZ': '04', 'AR': '05', 'CA': '06', - 'CO': '08', 'CT': '09', 'DE': '10', 'FL': '12', 'GA': '13', - 'HI': '15', 'ID': '16', 'IL': '17', 'IN': '18', 'IA': '19', - 'KS': '20', 'KY': '21', 'LA': '22', 'ME': '23', 'MD': '24', - 'MA': '25', 'MI': '26', 'MN': '27', 'MS': '28', 'MO': '29', - 'MT': '30', 'NE': '31', 'NV': '32', 'NH': '33', 'NJ': '34', - 'NM': '35', 'NY': '36', 'NC': '37', 'ND': '38', 'OH': '39', - 'OK': '40', 'OR': '41', 'PA': '42', 'RI': '44', 'SC': '45', - 'SD': '46', 'TN': '47', 'TX': '48', 'UT': '49', 'VT': '50', - 'VA': '51', 'WA': '53', 'WV': '54', 'WI': '55', 'WY': '56', - 'DC': '11' + "AL": "01", + "AK": "02", + "AZ": "04", + "AR": "05", + "CA": "06", + "CO": "08", + "CT": "09", + "DE": "10", + "FL": "12", + "GA": "13", + "HI": "15", + "ID": "16", + "IL": "17", + "IN": "18", + "IA": "19", + "KS": "20", + "KY": "21", + "LA": "22", + "ME": "23", + "MD": "24", + "MA": "25", + "MI": "26", + "MN": "27", + "MS": "28", + "MO": "29", + "MT": "30", + "NE": "31", + "NV": "32", + "NH": "33", + "NJ": "34", + "NM": "35", + "NY": "36", + "NC": "37", + "ND": "38", + "OH": "39", + "OK": "40", + "OR": "41", + "PA": "42", + "RI": "44", + "SC": "45", + "SD": "46", + "TN": "47", + "TX": "48", + "UT": "49", + "VT": "50", + "VA": "51", + "WA": "53", + "WV": "54", + "WI": "55", + "WY": "56", + "DC": "11", } TERRITORY_UCGIDS = { @@ -103,9 +143,9 @@ def pull_acs_table(group: str, geo: str, year: int) -> pd.DataFrame: "year": e.g., 2023 """ base = f"https://api.census.gov/data/{year}/acs/acs1" - - if group[0] == 'S': - base = base + "/subject" + + if group[0] == "S": + base = base + "/subject" geo_q = { "National": "us:*", "State": "state:*", diff --git a/policyengine_us_data/utils/db.py b/policyengine_us_data/utils/db.py index bb484fab..a8081db4 100644 --- a/policyengine_us_data/utils/db.py +++ b/policyengine_us_data/utils/db.py @@ -3,7 +3,10 @@ from sqlmodel import Session, select import sqlalchemy as sa -from policyengine_us_data.db.create_database_tables import Stratum, StratumConstraint +from policyengine_us_data.db.create_database_tables import ( + Stratum, + StratumConstraint, +) def get_stratum_by_id(session: Session, stratum_id: int) -> Optional[Stratum]: @@ -11,14 +14,18 @@ def get_stratum_by_id(session: Session, stratum_id: int) -> Optional[Stratum]: return 
session.get(Stratum, stratum_id) -def get_simple_stratum_by_ucgid(session: Session, ucgid: str) -> Optional[Stratum]: +def get_simple_stratum_by_ucgid( + session: Session, ucgid: str +) -> Optional[Stratum]: """ Finds a stratum defined *only* by a single ucgid_str constraint. """ constraint_count_subquery = ( select( StratumConstraint.stratum_id, - sa.func.count(StratumConstraint.stratum_id).label("constraint_count") + sa.func.count(StratumConstraint.stratum_id).label( + "constraint_count" + ), ) .group_by(StratumConstraint.stratum_id) .subquery() @@ -29,7 +36,7 @@ def get_simple_stratum_by_ucgid(session: Session, ucgid: str) -> Optional[Stratu .join(StratumConstraint) .join( constraint_count_subquery, - Stratum.stratum_id == constraint_count_subquery.c.stratum_id + Stratum.stratum_id == constraint_count_subquery.c.stratum_id, ) .where(StratumConstraint.constraint_variable == "ucgid_str") .where(StratumConstraint.value == ucgid) From 9c3a460246d697f2c10b7f7bffc8ca5fde579ed7 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Thu, 14 Aug 2025 22:23:04 -0400 Subject: [PATCH 14/27] fixed national stratum in agi script --- policyengine_us_data/db/etl_irs_soi.py | 7 +-- policyengine_us_data/db/temp.py | 63 -------------------------- 2 files changed, 4 insertions(+), 66 deletions(-) delete mode 100644 policyengine_us_data/db/temp.py diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py index a4a07cfe..5e28e464 100644 --- a/policyengine_us_data/db/etl_irs_soi.py +++ b/policyengine_us_data/db/etl_irs_soi.py @@ -446,6 +446,7 @@ def load_soi_data(long_dfs, year): # Make a National Stratum for each AGI Stub, even though there's no national target # There no national target because the data set only has agi_stub = 0 for national + note = f"Geo: 0100000US, AGI > {agi_income_lower}, AGI < {agi_income_upper}" nat_stratum = Stratum( parent_stratum_id=None, stratum_group_id=0, notes=note ) @@ -454,7 +455,7 @@ def load_soi_data(long_dfs, year): StratumConstraint( constraint_variable="ucgid_str", operation="in", - value=ucgid_i, + value="0100000US", ), StratumConstraint( constraint_variable="agi", @@ -515,7 +516,7 @@ def load_soi_data(long_dfs, year): ), ] ) - new_stratum.targets_rel = [ + new_stratum.targets_rel.append( Target( variable="person_count", period=year, @@ -523,7 +524,7 @@ def load_soi_data(long_dfs, year): source_id=5, active=True, ) - ] + ) session.add(new_stratum) session.flush() diff --git a/policyengine_us_data/db/temp.py b/policyengine_us_data/db/temp.py deleted file mode 100644 index 0c687ff3..00000000 --- a/policyengine_us_data/db/temp.py +++ /dev/null @@ -1,63 +0,0 @@ -# ucgid_str converts the household’s ucgid enumeration into a comma‑separated string of all hierarchical UCGID codes. 
-from policyengine_us import Simulation -from policyengine_us.variables.household.demographic.geographic.ucgid.ucgid_enum import ( - UCGID, -) - -# Minimal one-household simulation -sim = Simulation( - situation={ - "people": {"p1": {}}, - "households": {"h1": {"members": ["p1"]}}, - } -) - -# Assign a specific UCGID (California district 23) -sim.set_input("ucgid", 2024, UCGID.CA_23) - -# Use the ucgid_str Variable's formula -ucgid_str_val = sim.calculate("ucgid_str", 2024) -print(ucgid_str_val) -# ['5001800US0623,0400000US06,0100000US'] - - -# First, let's explore UCGID, the enum, and how it can create the hierarchy - -import pandas as pd -from policyengine_us.variables.household.demographic.geographic.ucgid.ucgid_enum import ( - UCGID, -) - -rows = [] -for node in UCGID: - codes = node.get_hierarchical_codes() - rows.append( - { - "name": node.name, - "code": codes[0], - "parent": codes[1] if len(codes) > 1 else None, - } - ) - -hierarchy_df = ( - pd.DataFrame(rows) - .sort_values(["parent", "code"], na_position="first") - .reset_index(drop=True) -) - -print(hierarchy_df) -# Out[262]: -# name code parent -# 0 US 0100000US None -# 1 AL 0400000US01 0100000US -# 2 AK 0400000US02 0100000US -# 3 AZ 0400000US04 0100000US -# 4 AR 0400000US05 0100000US -# .. ... ... ... -# 483 WI_05 5001800US5505 0400000US55 -# 484 WI_06 5001800US5506 0400000US55 -# 485 WI_07 5001800US5507 0400000US55 -# 486 WI_08 5001800US5508 0400000US55 -# 487 WY_01 5001800US5600 0400000US56 -# -# [488 rows x 3 columns] From 81e2011826e3733bafdddfbf79338931836b208e Mon Sep 17 00:00:00 2001 From: Ben Ogorek Date: Fri, 15 Aug 2025 08:43:33 -0400 Subject: [PATCH 15/27] refactor: use sqlmodel session --- .../db/create_initial_strata.py | 50 ++--- policyengine_us_data/db/etl_age.py | 122 +++++------ policyengine_us_data/db/etl_medicaid.py | 168 +++++++-------- policyengine_us_data/db/etl_snap.py | 203 +++++++++--------- 4 files changed, 266 insertions(+), 277 deletions(-) diff --git a/policyengine_us_data/db/create_initial_strata.py b/policyengine_us_data/db/create_initial_strata.py index a2a333df..068bca30 100644 --- a/policyengine_us_data/db/create_initial_strata.py +++ b/policyengine_us_data/db/create_initial_strata.py @@ -1,9 +1,7 @@ from typing import Dict import pandas as pd -from sqlalchemy import create_engine -from sqlalchemy.orm import sessionmaker -from sqlmodel import SQLModel, Session, select +from sqlmodel import Session, create_engine from policyengine_us.variables.household.demographic.geographic.ucgid.ucgid_enum import ( @@ -37,40 +35,38 @@ def main(): DATABASE_URL = "sqlite:///policy_data.db" engine = create_engine(DATABASE_URL) - Session = sessionmaker(bind=engine) - session = Session() - # map the ucgid_str 'code' to auto-generated 'stratum_id' code_to_stratum_id: Dict[str, int] = {} - for _, row in hierarchy_df.iterrows(): - parent_code = row["parent"] - - parent_id = ( - code_to_stratum_id.get(parent_code) if parent_code else None - ) + with Session(engine) as session: + for _, row in hierarchy_df.iterrows(): + parent_code = row["parent"] - new_stratum = Stratum( - parent_stratum_id=parent_id, - notes=f'{row["name"]} (ucgid {row["code"]})', - stratum_group_id=1, - ) + parent_id = ( + code_to_stratum_id.get(parent_code) if parent_code else None + ) - new_stratum.constraints_rel = [ - StratumConstraint( - constraint_variable="ucgid_str", - operation="in", - value=row["code"], + new_stratum = Stratum( + parent_stratum_id=parent_id, + notes=f'{row["name"]} (ucgid {row["code"]})', + stratum_group_id=1, ) - ] - 
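# A minimal, self-contained sketch of the session pattern this refactor
# adopts; the in-memory engine here is an assumption for illustration only:
from sqlmodel import Session, create_engine

demo_engine = create_engine("sqlite://")  # throwaway in-memory database
with Session(demo_engine) as demo_session:
    demo_session.flush()  # session is closed automatically on exit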
session.add(new_stratum) + new_stratum.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="in", + value=row["code"], + ) + ] + + session.add(new_stratum) - session.flush() + session.flush() - code_to_stratum_id[row["code"]] = new_stratum.stratum_id + code_to_stratum_id[row["code"]] = new_stratum.stratum_id - session.commit() + session.commit() if __name__ == "__main__": diff --git a/policyengine_us_data/db/etl_age.py b/policyengine_us_data/db/etl_age.py index 7bb36ed4..bc540373 100644 --- a/policyengine_us_data/db/etl_age.py +++ b/policyengine_us_data/db/etl_age.py @@ -1,11 +1,6 @@ -import requests -from pathlib import Path -import io - import pandas as pd import numpy as np -from sqlalchemy import create_engine -from sqlalchemy.orm import sessionmaker +from sqlmodel import Session, create_engine from policyengine_us_data.db.create_database_tables import ( Stratum, @@ -95,7 +90,7 @@ def get_parent_geo(geo): return {"National": None, "State": "National", "District": "State"}[geo] -def load_age_data(df_long, geo, year, stratum_lookup={}): +def load_age_data(df_long, geo, year, stratum_lookup=None): # Quick data quality check before loading ---- if geo == "National": @@ -111,78 +106,77 @@ def load_age_data(df_long, geo, year, stratum_lookup={}): DATABASE_URL = "sqlite:///policy_data.db" engine = create_engine(DATABASE_URL) - Session = sessionmaker(bind=engine) - session = Session() - - if not stratum_lookup: + if stratum_lookup is None: if geo != "National": raise ValueError("Include stratum_lookup unless National geo") stratum_lookup = {"National": {}} else: stratum_lookup[geo] = {} - for _, row in df_long.iterrows(): - - # Create the parent Stratum object. - # We will attach children to it before adding it to the session. - note = f"Age: {row['age_range']}, Geo: {row['ucgid_str']}" - parent_geo = get_parent_geo(geo) - parent_stratum_id = ( - stratum_lookup[parent_geo][row["age_range"]] - if parent_geo - else None - ) - - new_stratum = Stratum( - parent_stratum_id=parent_stratum_id, stratum_group_id=0, notes=note - ) - - # Create constraints and link them to the parent's relationship attribute. - # TODO: greater_than_or_equal_to to just greater than! - new_stratum.constraints_rel = [ - StratumConstraint( - constraint_variable="ucgid_str", - operation="in", - value=row["ucgid_str"], - ), - StratumConstraint( - constraint_variable="age", - operation="greater_than", - value=str(row["age_greater_than"]), - ), - ] + with Session(engine) as session: + for _, row in df_long.iterrows(): + # Create the parent Stratum object. + # We will attach children to it before adding it to the session. + note = f"Age: {row['age_range']}, Geo: {row['ucgid_str']}" + parent_geo = get_parent_geo(geo) + parent_stratum_id = ( + stratum_lookup[parent_geo][row["age_range"]] + if parent_geo + else None + ) - age_lt_value = row["age_less_than"] - if not np.isinf(age_lt_value): - new_stratum.constraints_rel.append( + new_stratum = Stratum( + parent_stratum_id=parent_stratum_id, + stratum_group_id=0, + notes=note, + ) + + # Create constraints and link them to the parent's relationship attribute. + # TODO: greater_than_or_equal_to to just greater than! 
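# A minimal sketch of the interval encoding behind the TODO above: a closed
# ACS band [lo, hi] is stored as the strict pair age > lo - 1 and
# age < hi + 1, matching the gt/lt columns built in transform_age_data.
lo, hi = 25, 34  # assumed example band
gt, lt = lo - 1, hi + 1
assert all(gt < age < lt for age in range(lo, hi + 1))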
+ new_stratum.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="in", + value=row["ucgid_str"], + ), StratumConstraint( constraint_variable="age", - operation="less_than", - value=str(row["age_less_than"]), + operation="greater_than", + value=str(row["age_greater_than"]), + ), + ] + + age_lt_value = row["age_less_than"] + if not np.isinf(age_lt_value): + new_stratum.constraints_rel.append( + StratumConstraint( + constraint_variable="age", + operation="less_than", + value=str(row["age_less_than"]), + ) ) - ) - # Create the Target and link it to the parent. - new_stratum.targets_rel.append( - Target( - variable=row["variable"], - period=year, - value=row["value"], - source_id=row["source_id"], - active=row["active"], + # Create the Target and link it to the parent. + new_stratum.targets_rel.append( + Target( + variable=row["variable"], + period=year, + value=row["value"], + source_id=row["source_id"], + active=row["active"], + ) ) - ) - # Add ONLY the parent object to the session. - # The 'cascade' setting will handle the children automatically. - session.add(new_stratum) + # Add ONLY the parent object to the session. + # The 'cascade' setting will handle the children automatically. + session.add(new_stratum) - # Flush to get the id - session.flush() - stratum_lookup[geo][row["age_range"]] = new_stratum.stratum_id + # Flush to get the id + session.flush() + stratum_lookup[geo][row["age_range"]] = new_stratum.stratum_id - # Commit all the new objects at once. - session.commit() + # Commit all the new objects at once. + session.commit() return stratum_lookup diff --git a/policyengine_us_data/db/etl_medicaid.py b/policyengine_us_data/db/etl_medicaid.py index 3a5ab7d7..4ff96278 100644 --- a/policyengine_us_data/db/etl_medicaid.py +++ b/policyengine_us_data/db/etl_medicaid.py @@ -1,8 +1,7 @@ import requests import pandas as pd -from sqlalchemy import create_engine -from sqlalchemy.orm import sessionmaker +from sqlmodel import Session, create_engine from policyengine_us_data.db.create_database_tables import ( Stratum, @@ -86,52 +85,21 @@ def load_medicaid_data(long_state, long_cd, year): DATABASE_URL = "sqlite:///policy_data.db" engine = create_engine(DATABASE_URL) - year = 2023 - - Session = sessionmaker(bind=engine) - session = Session() stratum_lookup = {} - # National ---------------- - nat_stratum = Stratum( - parent_stratum_id=None, - stratum_group_id=0, - notes="Geo: 0100000US Medicaid Enrolled", - ) - nat_stratum.constraints_rel = [ - StratumConstraint( - constraint_variable="ucgid_str", - operation="in", - value="0100000US", - ), - StratumConstraint( - constraint_variable="medicaid_enrolled", - operation="equals", - value="True", - ), - ] - # No target at the national level is provided at this time. 
- - session.add(nat_stratum) - session.flush() - stratum_lookup["National"] = nat_stratum.stratum_id - - # State ------------------- - stratum_lookup["State"] = {} - for _, row in long_state.iterrows(): - - note = f"Geo: {row['ucgid_str']} Medicaid Enrolled" - parent_stratum_id = nat_stratum.stratum_id - - new_stratum = Stratum( - parent_stratum_id=parent_stratum_id, stratum_group_id=0, notes=note + with Session(engine) as session: + # National ---------------- + nat_stratum = Stratum( + parent_stratum_id=None, + stratum_group_id=0, + notes="Geo: 0100000US Medicaid Enrolled", ) - new_stratum.constraints_rel = [ + nat_stratum.constraints_rel = [ StratumConstraint( constraint_variable="ucgid_str", operation="in", - value=row["ucgid_str"], + value="0100000US", ), StratumConstraint( constraint_variable="medicaid_enrolled", @@ -139,55 +107,87 @@ def load_medicaid_data(long_state, long_cd, year): value="True", ), ] - new_stratum.targets_rel.append( - Target( - variable="person_count", - period=year, - value=row["medicaid_enrollment"], - source_id=2, - active=True, - ) - ) - session.add(new_stratum) + # No target at the national level is provided at this time. + + session.add(nat_stratum) session.flush() - stratum_lookup["State"][row["ucgid_str"]] = new_stratum.stratum_id + stratum_lookup["National"] = nat_stratum.stratum_id - # District ------------------- - for _, row in long_cd.iterrows(): + # State ------------------- + stratum_lookup["State"] = {} + for _, row in long_state.iterrows(): - note = f"Geo: {row['ucgid_str']} Medicaid Enrolled" - parent_stratum_id = stratum_lookup["State"][ - f'0400000US{row["ucgid_str"][-4:-2]}' - ] + note = f"Geo: {row['ucgid_str']} Medicaid Enrolled" + parent_stratum_id = nat_stratum.stratum_id - new_stratum = Stratum( - parent_stratum_id=parent_stratum_id, stratum_group_id=0, notes=note - ) - new_stratum.constraints_rel = [ - StratumConstraint( - constraint_variable="ucgid_str", - operation="in", - value=row["ucgid_str"], - ), - StratumConstraint( - constraint_variable="medicaid_enrolled", - operation="equals", - value="True", - ), - ] - new_stratum.targets_rel.append( - Target( - variable="person_count", - period=year, - value=row["medicaid_enrollment"], - source_id=2, - active=True, + new_stratum = Stratum( + parent_stratum_id=parent_stratum_id, + stratum_group_id=0, + notes=note, ) - ) - session.add(new_stratum) - session.flush() + new_stratum.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="in", + value=row["ucgid_str"], + ), + StratumConstraint( + constraint_variable="medicaid_enrolled", + operation="equals", + value="True", + ), + ] + new_stratum.targets_rel.append( + Target( + variable="person_count", + period=year, + value=row["medicaid_enrollment"], + source_id=2, + active=True, + ) + ) + session.add(new_stratum) + session.flush() + stratum_lookup["State"][row["ucgid_str"]] = new_stratum.stratum_id + + # District ------------------- + for _, row in long_cd.iterrows(): + + note = f"Geo: {row['ucgid_str']} Medicaid Enrolled" + parent_stratum_id = stratum_lookup["State"][ + f'0400000US{row["ucgid_str"][-4:-2]}' + ] + + new_stratum = Stratum( + parent_stratum_id=parent_stratum_id, + stratum_group_id=0, + notes=note, + ) + new_stratum.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", + operation="in", + value=row["ucgid_str"], + ), + StratumConstraint( + constraint_variable="medicaid_enrolled", + operation="equals", + value="True", + ), + ] + new_stratum.targets_rel.append( + Target( + 
variable="person_count", + period=year, + value=row["medicaid_enrollment"], + source_id=2, + active=True, + ) + ) + session.add(new_stratum) + session.flush() - session.commit() + session.commit() if __name__ == "__main__": diff --git a/policyengine_us_data/db/etl_snap.py b/policyengine_us_data/db/etl_snap.py index fb110025..a60c0074 100644 --- a/policyengine_us_data/db/etl_snap.py +++ b/policyengine_us_data/db/etl_snap.py @@ -1,15 +1,11 @@ import requests import zipfile import io -import os -import re -from pathlib import Path import pandas as pd import numpy as np import us -from sqlalchemy import create_engine -from sqlalchemy.orm import sessionmaker +from sqlmodel import Session, create_engine from policyengine_us_data.db.create_database_tables import ( Stratum, @@ -151,129 +147,132 @@ def load_administrative_snap_data(df_states, year): DATABASE_URL = "sqlite:///policy_data.db" engine = create_engine(DATABASE_URL) - Session = sessionmaker(bind=engine) - session = Session() - stratum_lookup = {} - # National ---------------- - nat_stratum = Stratum( - parent_stratum_id=None, - stratum_group_id=0, - notes="Geo: 0100000US Received SNAP Benefits", - ) - nat_stratum.constraints_rel = [ - StratumConstraint( - constraint_variable="ucgid_str", - operation="in", - value="0100000US", - ), - StratumConstraint( - constraint_variable="snap", - operation="is_greater_than", - value="0", - ), - ] - # No target at the national level is provided at this time. Keeping it - # so that the state strata can have a parent stratum - - session.add(nat_stratum) - session.flush() - stratum_lookup["National"] = nat_stratum.stratum_id - - # State ------------------- - stratum_lookup["State"] = {} - for _, row in df_states.iterrows(): - - note = f"Geo: {row['ucgid_str']} Received SNAP Benefits" - parent_stratum_id = nat_stratum.stratum_id - - new_stratum = Stratum( - parent_stratum_id=parent_stratum_id, stratum_group_id=0, notes=note + with Session(engine) as session: + # National ---------------- + nat_stratum = Stratum( + parent_stratum_id=None, + stratum_group_id=0, + notes="Geo: 0100000US Received SNAP Benefits", ) - new_stratum.constraints_rel = [ + nat_stratum.constraints_rel = [ StratumConstraint( constraint_variable="ucgid_str", operation="in", - value=row["ucgid_str"], + value="0100000US", ), StratumConstraint( constraint_variable="snap", - operation="is_greater_than", + operation="greater_than", value="0", ), ] - # Two targets now. Same data source. Same stratum - new_stratum.targets_rel.append( - Target( - variable="household_count", - period=year, - value=row["Households"], - source_id=3, - active=True, + # No target at the national level is provided at this time. 
Keeping it
+        # so that the state strata can have a parent stratum
+
+        session.add(nat_stratum)
+        session.flush()
+        stratum_lookup["National"] = nat_stratum.stratum_id
+
+        # State -------------------
+        stratum_lookup["State"] = {}
+        for _, row in df_states.iterrows():
+
+            note = f"Geo: {row['ucgid_str']} Received SNAP Benefits"
+            parent_stratum_id = nat_stratum.stratum_id
+
+            new_stratum = Stratum(
+                parent_stratum_id=parent_stratum_id,
+                stratum_group_id=0,
+                notes=note,
             )
-        new_stratum.constraints_rel = [
+            new_stratum.constraints_rel = [
                 StratumConstraint(
                     constraint_variable="ucgid_str",
                     operation="in",
                     value=row["ucgid_str"],
                 ),
                 StratumConstraint(
                     constraint_variable="snap",
-                    operation="is_greater_than",
+                    operation="greater_than",
                     value="0",
                 ),
             ]
-        # Two targets now. Same data source. Same stratum
-        new_stratum.targets_rel.append(
-            Target(
-                variable="household_count",
-                period=year,
-                value=row["Households"],
-                source_id=3,
-                active=True,
+            # Two targets now. Same data source. Same stratum
+            new_stratum.targets_rel.append(
+                Target(
+                    variable="household_count",
+                    period=year,
+                    value=row["Households"],
+                    source_id=3,
+                    active=True,
+                )
             )
-        new_stratum.targets_rel.append(
-            Target(
-                variable="snap",
-                period=year,
-                value=row["Cost"],
-                source_id=3,
-                active=True,
+            new_stratum.targets_rel.append(
+                Target(
+                    variable="snap",
+                    period=year,
+                    value=row["Cost"],
+                    source_id=3,
+                    active=True,
+                )
+            )
+            session.add(new_stratum)
+            session.flush()
+            stratum_lookup["State"][row["ucgid_str"]] = new_stratum.stratum_id
 
-    session.commit()
+        session.commit()
 
     return stratum_lookup
 
 
-def load_survey_snap_data(survey_df, year, stratum_lookup={}):
+def load_survey_snap_data(survey_df, year, stratum_lookup=None):
     """Use an already defined stratum_lookup to load the survey SNAP data"""
 
+    if stratum_lookup is None:
+        raise ValueError("stratum_lookup must be provided")
+
     DATABASE_URL = "sqlite:///policy_data.db"
     engine = create_engine(DATABASE_URL)
 
-    Session = sessionmaker(bind=engine)
-    session = Session()
-
-    # Create new strata for districts whose households recieve SNAP benefits
-    district_df = survey_df.copy()
-    for _, row in district_df.iterrows():
-        note = f"Geo: {row['ucgid_str']} Received SNAP Benefits"
-        state_ucgid_str = "0400000US" + row["ucgid_str"][9:11]
-        state_stratum_id = stratum_lookup["State"][state_ucgid_str]
-        new_stratum = Stratum(
-            parent_stratum_id=state_stratum_id, stratum_group_id=0, notes=note
-        )
+    with Session(engine) as session:
+        # Create new strata for districts whose households receive SNAP benefits
+        district_df = survey_df.copy()
+        for _, row in district_df.iterrows():
+            note = f"Geo: {row['ucgid_str']} Received SNAP Benefits"
+            state_ucgid_str = "0400000US" + row["ucgid_str"][9:11]
+            state_stratum_id = stratum_lookup["State"][state_ucgid_str]
+            new_stratum = Stratum(
+                parent_stratum_id=state_stratum_id,
+                stratum_group_id=0,
+                notes=note,
+            )
 
-        new_stratum.constraints_rel = [
-            StratumConstraint(
-                constraint_variable="ucgid_str",
-                operation="in",
-                value=row["ucgid_str"],
-            ),
-            StratumConstraint(
-                constraint_variable="snap",
-                operation="greater_than",
-                value="0",
-            ),
-        ]
-        new_stratum.targets_rel.append(
-            Target(
-                variable="household_count",
-                period=year,
-                value=row["snap_household_ct"],
-                source_id=4,
-                active=True,
+            new_stratum.constraints_rel = [
+                StratumConstraint(
+                    constraint_variable="ucgid_str",
+                    operation="in",
+                    value=row["ucgid_str"],
+                ),
+                StratumConstraint(
+                    constraint_variable="snap",
+                    operation="greater_than",
+                    value="0",
+                ),
+            ]
+            new_stratum.targets_rel.append(
+                Target(
+                    variable="household_count",
+                    period=year,
value=row["snap_household_ct"],
+                    source_id=4,
+                    active=True,
+                )
             )
-        session.add(new_stratum)
-        session.flush()
+            session.add(new_stratum)
+            session.flush()
 
-    session.commit()
+        session.commit()
 
     return stratum_lookup
 

From d5b3571cf6a592d14efccfe0ab7230bd12bc01c0 Mon Sep 17 00:00:00 2001
From: "baogorek@gmail.com" <baogorek@gmail.com>
Date: Fri, 15 Aug 2025 08:50:06 -0400
Subject: [PATCH 16/27] storage file updates

---
 policyengine_us_data/storage/README.md       |    6 +
 .../storage/district_mapping.csv             | 1502 +++++++++++++++++
 .../storage/upload_completed_datasets.py     |    1 +
 3 files changed, 1509 insertions(+)
 create mode 100644 policyengine_us_data/storage/district_mapping.csv

diff --git a/policyengine_us_data/storage/README.md b/policyengine_us_data/storage/README.md
index 55f98ed9..d2c1f054 100644
--- a/policyengine_us_data/storage/README.md
+++ b/policyengine_us_data/storage/README.md
@@ -9,3 +9,9 @@
   • Source: MACPAC Enrollment Tables, FFY 2024
   • Date: 2024
   • Location: https://www.medicaid.gov/resources-for-states/downloads/eligib-oper-and-enrol-snap-december2024.pdf#page=26
+
+- **district_mapping.csv**
+  • Source: created by the script `policyengine_us/storage/calibration_targets/make_district_mapping.py`
+  • Notes: this script is not part of `make data` because of the length of time it takes to run and the
+    likelihood of timeout errors. See the script for more notes, including an alternative source. Also,
+    once the IRS SOI updates their data in 2026, this mapping will likely be unnecessary.
diff --git a/policyengine_us_data/storage/district_mapping.csv b/policyengine_us_data/storage/district_mapping.csv
new file mode 100644
index 00000000..fb5eef63
--- /dev/null
+++ b/policyengine_us_data/storage/district_mapping.csv
@@ -0,0 +1,1502 @@
+code_old,code_new,proportion
+5001800US0101,5001800US0102,0.5011459283535776
+5001800US0101,5001800US0101,0.4820707064326126
+5001800US0101,5001800US0107,0.016783365213809763
+5001800US0102,5001800US0101,0.4678169491253603
+5001800US0102,5001800US0102,0.39705192508930826
+5001800US0102,5001800US0106,0.1351311257853314
+5001800US0103,5001800US0103,0.8198603796970461
+5001800US0103,5001800US0102,0.180139620302954
+5001800US0104,5001800US0104,0.7692270137962715
+5001800US0104,5001800US0103,0.17447390203471147
+5001800US0104,5001800US0105,0.04907806643602609
+5001800US0104,5001800US0107,0.007221017732990997
+5001800US0105,5001800US0105,0.878817662367874
+5001800US0105,5001800US0104,0.12118233763212609
+5001800US0106,5001800US0106,0.7473312551163003
+5001800US0106,5001800US0107,0.22864246122636855
+5001800US0106,5001800US0104,0.024026283657331227
+5001800US0107,5001800US0107,0.8633782301260244
+5001800US0107,5001800US0102,0.08164855122567322
+5001800US0107,5001800US0104,0.034608328493330016
+5001800US0107,5001800US0106,0.020364890154972295
+5001800US0200,5001800US0200,1.0
+5001800US0401,5001800US0402,0.6948503590446475
+5001800US0401,5001800US0406,0.29812837069923254
+5001800US0401,5001800US0407,0.007021270256119934
+5001800US0402,5001800US0406,0.7492521226985535
+5001800US0402,5001800US0407,0.25074787730144654
+5001800US0403,5001800US0407,0.7754175253607597
+5001800US0403,5001800US0403,0.12104397018340192
+5001800US0403,5001800US0409,0.08970929465623763
+5001800US0403,5001800US0406,0.013829209799600803
+5001800US0403,5001800US0402,0.0
+5001800US0404,5001800US0409,0.46093471548467213
+5001800US0404,5001800US0402,0.32851231142675985
+5001800US0404,5001800US0405,0.19748931684056112
+5001800US0404,5001800US0401,0.007724890354579596
+5001800US0404,5001800US0407,0.005338765893427278
+5001800US0404,5001800US0406,0.0 +5001800US0404,5001800US0408,0.0 +5001800US0405,5001800US0405,0.603375246029751 +5001800US0405,5001800US0404,0.3889039941038361 +5001800US0405,5001800US0401,0.007720759866412899 +5001800US0406,5001800US0401,0.7549732899257154 +5001800US0406,5001800US0408,0.24332248142885407 +5001800US0406,5001800US0404,0.0017042286454305089 +5001800US0407,5001800US0403,0.8274880581174503 +5001800US0407,5001800US0409,0.0731090792934076 +5001800US0407,5001800US0408,0.049613133214102066 +5001800US0407,5001800US0401,0.041687081990827386 +5001800US0407,5001800US0407,0.008038587992499856 +5001800US0407,5001800US0404,6.405939171279232e-05 +5001800US0408,5001800US0408,0.7247876203185409 +5001800US0408,5001800US0409,0.27432545665102437 +5001800US0408,5001800US0403,0.0008836562789413765 +5001800US0408,5001800US0401,3.2667514933137765e-06 +5001800US0408,5001800US0407,0.0 +5001800US0409,5001800US0404,0.6942271093254387 +5001800US0409,5001800US0401,0.23192258506278385 +5001800US0409,5001800US0403,0.05066437655628968 +5001800US0409,5001800US0408,0.023060778551217485 +5001800US0409,5001800US0405,0.0001251505042702215 +5001800US0409,5001800US0402,0.0 +5001800US0501,5001800US0501,0.9636508992922538 +5001800US0501,5001800US0502,0.030829978572819945 +5001800US0501,5001800US0504,0.005519122134926303 +5001800US0502,5001800US0502,0.9185648948063828 +5001800US0502,5001800US0504,0.06575058440898465 +5001800US0502,5001800US0501,0.015684520784632585 +5001800US0503,5001800US0503,0.8207885304659498 +5001800US0503,5001800US0501,0.11114866688730685 +5001800US0503,5001800US0504,0.06806280264674333 +5001800US0504,5001800US0504,0.937181462111058 +5001800US0504,5001800US0503,0.06281853788894193 +5001800US0601,5001800US0601,0.7902007441225233 +5001800US0601,5001800US0603,0.20979925587747675 +5001800US0602,5001800US0602,0.9992455338468796 +5001800US0602,5001800US0604,0.0007544661531203383 +5001800US0603,5001800US0604,0.44723347944752295 +5001800US0603,5001800US0601,0.25560513928793605 +5001800US0603,5001800US0608,0.2287328339799995 +5001800US0603,5001800US0606,0.0501127320716451 +5001800US0603,5001800US0607,0.01653583513428961 +5001800US0603,5001800US0603,0.001779980078606797 +5001800US0604,5001800US0603,0.6014165640096494 +5001800US0604,5001800US0605,0.38940702976059494 +5001800US0604,5001800US0620,0.009176406229755659 +5001800US0604,5001800US0613,0.0 +5001800US0605,5001800US0604,0.5395799676898223 +5001800US0605,5001800US0608,0.3792849775928005 +5001800US0605,5001800US0602,0.06123965207232369 +5001800US0605,5001800US0610,0.01989540264505353 +5001800US0606,5001800US0607,0.5752703267757333 +5001800US0606,5001800US0606,0.4243660728533943 +5001800US0606,5001800US0604,0.0003636003708723783 +5001800US0607,5001800US0606,0.5304593199540961 +5001800US0607,5001800US0607,0.2849996108440379 +5001800US0607,5001800US0603,0.18454106920186597 +5001800US0608,5001800US0623,0.7911347146383424 +5001800US0608,5001800US0633,0.09254544094547086 +5001800US0608,5001800US0603,0.07483581419702297 +5001800US0608,5001800US0628,0.02122223053332336 +5001800US0608,5001800US0625,0.020261799685840378 +5001800US0609,5001800US0609,0.7449549952013407 +5001800US0609,5001800US0610,0.12939921639709387 +5001800US0609,5001800US0608,0.058466694739002095 +5001800US0609,5001800US0607,0.03610857581347429 +5001800US0609,5001800US0613,0.03107051784908911 +5001800US0610,5001800US0605,0.46922839800977745 +5001800US0610,5001800US0613,0.36093406736538647 +5001800US0610,5001800US0609,0.1698375346248361 +5001800US0611,5001800US0610,0.5421216175756663 
+5001800US0611,5001800US0608,0.4577227617261042 +5001800US0611,5001800US0609,0.00015562069822947625 +5001800US0612,5001800US0611,0.8824556656398301 +5001800US0612,5001800US0615,0.11754433436016984 +5001800US0613,5001800US0612,0.968319632028912 +5001800US0613,5001800US0614,0.03168036797108799 +5001800US0613,5001800US0617,0.0 +5001800US0614,5001800US0615,0.8147246523351065 +5001800US0614,5001800US0616,0.10011027095148078 +5001800US0614,5001800US0611,0.08516507671341275 +5001800US0615,5001800US0614,0.879209193527972 +5001800US0615,5001800US0610,0.11949548033093281 +5001800US0615,5001800US0617,0.001295326141095208 +5001800US0615,5001800US0612,0.0 +5001800US0616,5001800US0621,0.576124338019409 +5001800US0616,5001800US0613,0.4206240681965774 +5001800US0616,5001800US0620,0.002105990066861183 +5001800US0616,5001800US0605,0.0011456037171525002 +5001800US0617,5001800US0617,0.8767390155851017 +5001800US0617,5001800US0614,0.11236828974787944 +5001800US0617,5001800US0616,0.009779211059258606 +5001800US0617,5001800US0618,0.0011134836077602653 +5001800US0618,5001800US0616,0.7235221984095743 +5001800US0618,5001800US0619,0.10660128717891398 +5001800US0618,5001800US0615,0.08535457356731285 +5001800US0618,5001800US0617,0.07351142602123051 +5001800US0618,5001800US0618,0.011010514822968352 +5001800US0619,5001800US0618,0.4510363311700647 +5001800US0619,5001800US0619,0.32974361985417794 +5001800US0619,5001800US0616,0.1975073480905326 +5001800US0619,5001800US0617,0.021712700885224725 +5001800US0620,5001800US0618,0.5925036040365209 +5001800US0620,5001800US0619,0.4074963959634791 +5001800US0621,5001800US0622,0.5329178551896014 +5001800US0621,5001800US0621,0.18150259864876814 +5001800US0621,5001800US0613,0.17215597787464262 +5001800US0621,5001800US0620,0.11342356828698776 +5001800US0622,5001800US0620,0.41246978264039647 +5001800US0622,5001800US0621,0.321831053895241 +5001800US0622,5001800US0605,0.1605561021095157 +5001800US0622,5001800US0622,0.10514306135484679 +5001800US0623,5001800US0620,0.4311170291854314 +5001800US0623,5001800US0622,0.3619284258105871 +5001800US0623,5001800US0627,0.16211411824180327 +5001800US0623,5001800US0623,0.03673538167724541 +5001800US0623,5001800US0621,0.00810504508493281 +5001800US0624,5001800US0624,0.909745972391165 +5001800US0624,5001800US0619,0.09025402760883502 +5001800US0624,5001800US0626,0.0 +5001800US0625,5001800US0627,0.7963998249467975 +5001800US0625,5001800US0626,0.16642496739205054 +5001800US0625,5001800US0623,0.03555339122674538 +5001800US0625,5001800US0632,0.001615380654904922 +5001800US0625,5001800US0628,6.435779501613235e-06 +5001800US0626,5001800US0626,0.8517580843638418 +5001800US0626,5001800US0624,0.14824191563615824 +5001800US0627,5001800US0628,0.9333023507342876 +5001800US0627,5001800US0631,0.04593987122505196 +5001800US0627,5001800US0630,0.010313825562334173 +5001800US0627,5001800US0635,0.009475605428151353 +5001800US0627,5001800US0638,0.0009683470501749108 +5001800US0627,5001800US0634,0.0 +5001800US0628,5001800US0630,0.9093157999708373 +5001800US0628,5001800US0628,0.06209304235460142 +5001800US0628,5001800US0634,0.016399816979048125 +5001800US0628,5001800US0632,0.009668936551060667 +5001800US0628,5001800US0629,0.0022592696257288576 +5001800US0628,5001800US0627,0.0002631345187236132 +5001800US0629,5001800US0629,0.9349816769417437 +5001800US0629,5001800US0632,0.043869984424961504 +5001800US0629,5001800US0627,0.019276524536392385 +5001800US0629,5001800US0630,0.001871814096902376 +5001800US0630,5001800US0632,0.7750581173919288 
+5001800US0630,5001800US0629,0.11788517210448435 +5001800US0630,5001800US0627,0.06792398640562922 +5001800US0630,5001800US0630,0.038539583302982565 +5001800US0630,5001800US0626,0.0005931407949751609 +5001800US0631,5001800US0633,0.7205763667291623 +5001800US0631,5001800US0635,0.1382415144629724 +5001800US0631,5001800US0628,0.07333507162161602 +5001800US0631,5001800US0623,0.06784704718624932 +5001800US0632,5001800US0631,0.9862601829590875 +5001800US0632,5001800US0638,0.0120510508192493 +5001800US0632,5001800US0628,0.0016887662216632747 +5001800US0632,5001800US0635,0.0 +5001800US0633,5001800US0636,0.7322890602472397 +5001800US0633,5001800US0632,0.11928339549982792 +5001800US0633,5001800US0630,0.07530486386932944 +5001800US0633,5001800US0644,0.04079176239686718 +5001800US0633,5001800US0626,0.03222115769205726 +5001800US0633,5001800US0643,0.00010976029467848605 +5001800US0634,5001800US0634,0.9419729727306151 +5001800US0634,5001800US0637,0.037979859572979904 +5001800US0634,5001800US0630,0.01977097662239838 +5001800US0634,5001800US0628,0.0002761910740066537 +5001800US0635,5001800US0635,0.817698648971904 +5001800US0635,5001800US0633,0.17846596699720388 +5001800US0635,5001800US0638,0.003835384030892129 +5001800US0635,5001800US0631,0.0 +5001800US0635,5001800US0640,0.0 +5001800US0636,5001800US0625,0.7774587938511587 +5001800US0636,5001800US0641,0.21970444069455705 +5001800US0636,5001800US0648,0.0028367654542842463 +5001800US0637,5001800US0637,0.7674439153156702 +5001800US0637,5001800US0636,0.20994549380477542 +5001800US0637,5001800US0634,0.00766878788603953 +5001800US0637,5001800US0630,0.007550976454280881 +5001800US0637,5001800US0632,0.006917740008578145 +5001800US0637,5001800US0643,0.0004730865306558231 +5001800US0638,5001800US0638,0.6887156260676834 +5001800US0638,5001800US0645,0.153737658674189 +5001800US0638,5001800US0642,0.09233370419364165 +5001800US0638,5001800US0644,0.05784019132887717 +5001800US0638,5001800US0631,0.005413976469351461 +5001800US0638,5001800US0634,0.0011844168586671805 +5001800US0638,5001800US0628,0.0007744264075900797 +5001800US0639,5001800US0645,0.35148980358066995 +5001800US0639,5001800US0638,0.3314724177470801 +5001800US0639,5001800US0640,0.20141183792614742 +5001800US0639,5001800US0646,0.10893543902612272 +5001800US0639,5001800US0635,0.005736648593066638 +5001800US0639,5001800US0631,0.0009538531269132028 +5001800US0640,5001800US0642,0.4998035349610858 +5001800US0640,5001800US0637,0.2331520216516807 +5001800US0640,5001800US0634,0.15496950827355685 +5001800US0640,5001800US0644,0.09446356234285 +5001800US0640,5001800US0638,0.015750681460572197 +5001800US0640,5001800US0643,0.001836023054247257 +5001800US0640,5001800US0628,2.4668256007160842e-05 +5001800US0641,5001800US0639,0.9629794917777905 +5001800US0641,5001800US0641,0.03549452538231834 +5001800US0641,5001800US0625,0.0015259828398910835 +5001800US0642,5001800US0641,0.7325250092756586 +5001800US0642,5001800US0648,0.14663007740216222 +5001800US0642,5001800US0635,0.03994483608336192 +5001800US0642,5001800US0639,0.03684828289475219 +5001800US0642,5001800US0640,0.023671013975325582 +5001800US0642,5001800US0625,0.020380780368739512 +5001800US0643,5001800US0643,0.7911076363450807 +5001800US0643,5001800US0636,0.1290098010271214 +5001800US0643,5001800US0637,0.05355938751781425 +5001800US0643,5001800US0644,0.026323175109983634 +5001800US0644,5001800US0644,0.6706043411251857 +5001800US0644,5001800US0643,0.27499939117093486 +5001800US0644,5001800US0642,0.05439626770387946 +5001800US0645,5001800US0640,0.8173764937841915 
+5001800US0645,5001800US0647,0.1781247975118253 +5001800US0645,5001800US0649,0.003175014579148578 +5001800US0645,5001800US0646,0.0013236941248345381 +5001800US0646,5001800US0646,0.8446515657194924 +5001800US0646,5001800US0645,0.08557874500843862 +5001800US0646,5001800US0640,0.069769689272069 +5001800US0646,5001800US0647,0.0 +5001800US0647,5001800US0642,0.41273816684764353 +5001800US0647,5001800US0645,0.3861931582935575 +5001800US0647,5001800US0644,0.13927145747564623 +5001800US0647,5001800US0646,0.0574975197056457 +5001800US0647,5001800US0647,0.00429969767750705 +5001800US0647,5001800US0636,0.0 +5001800US0648,5001800US0647,0.6061644354137836 +5001800US0648,5001800US0645,0.17512332201809025 +5001800US0648,5001800US0649,0.0871521016978921 +5001800US0648,5001800US0640,0.07501824469389795 +5001800US0648,5001800US0646,0.0565418961763361 +5001800US0649,5001800US0649,0.9467849127978065 +5001800US0649,5001800US0650,0.049865068699184355 +5001800US0649,5001800US0640,0.0033283775566073054 +5001800US0649,5001800US0647,2.1640946401868047e-05 +5001800US0649,5001800US0648,0.0 +5001800US0650,5001800US0648,0.7026490283613446 +5001800US0650,5001800US0650,0.22277524290966386 +5001800US0650,5001800US0651,0.07349453453256302 +5001800US0650,5001800US0649,0.0010811941964285715 +5001800US0650,5001800US0652,0.0 +5001800US0651,5001800US0652,0.8287402679957188 +5001800US0651,5001800US0625,0.1625106624192056 +5001800US0651,5001800US0648,0.008269147531661273 +5001800US0651,5001800US0651,0.0003531501902483461 +5001800US0651,5001800US0650,0.00012677186316607296 +5001800US0652,5001800US0651,0.4962193959357763 +5001800US0652,5001800US0650,0.411860889626181 +5001800US0652,5001800US0648,0.09191773246998805 +5001800US0652,5001800US0652,1.981968054638895e-06 +5001800US0653,5001800US0651,0.596359860966912 +5001800US0653,5001800US0652,0.24744097864849005 +5001800US0653,5001800US0650,0.12459892565341037 +5001800US0653,5001800US0648,0.03160023473118765 +5001800US0801,5001800US0801,0.8051210712426631 +5001800US0801,5001800US0806,0.1736981924575039 +5001800US0801,5001800US0807,0.021180736299833067 +5001800US0802,5001800US0802,0.6336205761200139 +5001800US0802,5001800US0807,0.25830170826047577 +5001800US0802,5001800US0804,0.09109034065927386 +5001800US0802,5001800US0808,0.014904476408337267 +5001800US0802,5001800US0806,0.0020828985518992265 +5001800US0803,5001800US0803,0.9294525872910616 +5001800US0803,5001800US0802,0.06644105519096945 +5001800US0803,5001800US0807,0.004106357517968887 +5001800US0804,5001800US0804,0.5029337358528304 +5001800US0804,5001800US0808,0.291207855380495 +5001800US0804,5001800US0802,0.15484187515332773 +5001800US0804,5001800US0803,0.050532135150296344 +5001800US0804,5001800US0806,0.000484398463050554 +5001800US0804,5001800US0807,0.0 +5001800US0805,5001800US0805,0.8312852988063479 +5001800US0805,5001800US0807,0.16231261698785063 +5001800US0805,5001800US0804,0.006402084205801441 +5001800US0806,5001800US0806,0.8110107505768004 +5001800US0806,5001800US0804,0.14431790290118304 +5001800US0806,5001800US0808,0.04467134652201659 +5001800US0807,5001800US0807,0.5448955821367417 +5001800US0807,5001800US0808,0.4551044178632583 +5001800US0807,5001800US0801,0.0 +5001800US0901,5001800US0901,0.9672058926598738 +5001800US0901,5001800US0902,0.03089007332450013 +5001800US0901,5001800US0905,0.001904034015626095 +5001800US0902,5001800US0902,1.0 +5001800US0903,5001800US0903,0.9781755317954713 +5001800US0903,5001800US0901,0.019992090206190017 +5001800US0903,5001800US0905,0.0018323779983385978 
+5001800US0904,5001800US0904,0.9696486259458347 +5001800US0904,5001800US0903,0.030351374054165325 +5001800US0905,5001800US0905,0.9878146024980679 +5001800US0905,5001800US0901,0.007909204505424981 +5001800US0905,5001800US0903,0.004276192996507192 +5001800US1000,5001800US1000,1.0 +5001800US1198,5001800US1198,1.0 +5001800US1201,5001800US1201,0.9747987495611197 +5001800US1201,5001800US1202,0.025201250438880357 +5001800US1202,5001800US1202,0.6993820113510333 +5001800US1202,5001800US1203,0.3006179886489667 +5001800US1203,5001800US1203,0.5792179712439726 +5001800US1203,5001800US1206,0.2533873174390422 +5001800US1203,5001800US1204,0.16739471131698533 +5001800US1204,5001800US1205,0.7624656298645562 +5001800US1204,5001800US1204,0.2245951961825509 +5001800US1204,5001800US1206,0.01293917395289291 +5001800US1205,5001800US1204,0.39587483354951214 +5001800US1205,5001800US1202,0.36349765080899793 +5001800US1205,5001800US1205,0.15146400892627834 +5001800US1205,5001800US1203,0.08916350671521157 +5001800US1206,5001800US1206,0.6101570207856056 +5001800US1206,5001800US1207,0.38717161920369225 +5001800US1206,5001800US1211,0.00237036169963713 +5001800US1206,5001800US1205,0.00030099831106503237 +5001800US1207,5001800US1207,0.7243416431150246 +5001800US1207,5001800US1210,0.25373695130119417 +5001800US1207,5001800US1209,0.021921405583781223 +5001800US1208,5001800US1208,1.0 +5001800US1209,5001800US1218,0.5605197858168995 +5001800US1209,5001800US1209,0.3590995167820295 +5001800US1209,5001800US1210,0.07936855165208306 +5001800US1209,5001800US1211,0.0010121457489878543 +5001800US1210,5001800US1210,0.5368456034843628 +5001800US1210,5001800US1211,0.2794200823797457 +5001800US1210,5001800US1209,0.18373431413589142 +5001800US1211,5001800US1212,0.5453889102810936 +5001800US1211,5001800US1211,0.2894787507853191 +5001800US1211,5001800US1206,0.13233133372514846 +5001800US1211,5001800US1203,0.032801005208438885 +5001800US1212,5001800US1212,0.5772534157521615 +5001800US1212,5001800US1213,0.23956926250122584 +5001800US1212,5001800US1215,0.17647868298593206 +5001800US1212,5001800US1214,0.006698638760680651 +5001800US1213,5001800US1213,0.7453819831827065 +5001800US1213,5001800US1214,0.2546180168172935 +5001800US1214,5001800US1214,0.6632249170480902 +5001800US1214,5001800US1215,0.3367750829519099 +5001800US1215,5001800US1215,0.5084681430291608 +5001800US1215,5001800US1216,0.19446251879108797 +5001800US1215,5001800US1218,0.17809650953203646 +5001800US1215,5001800US1211,0.060903692402043276 +5001800US1215,5001800US1214,0.05644244504775367 +5001800US1215,5001800US1209,0.0016266911979178353 +5001800US1216,5001800US1216,0.6709346399186465 +5001800US1216,5001800US1217,0.2958556747095621 +5001800US1216,5001800US1214,0.033209685371791316 +5001800US1217,5001800US1217,0.5462582449029376 +5001800US1217,5001800US1218,0.4537417550970623 +5001800US1218,5001800US1221,0.95046338878938 +5001800US1218,5001800US1220,0.04953661121062002 +5001800US1218,5001800US1222,0.0 +5001800US1219,5001800US1219,0.8897051787745729 +5001800US1219,5001800US1217,0.09399619379737818 +5001800US1219,5001800US1226,0.01629862742804895 +5001800US1220,5001800US1220,0.8047201512884832 +5001800US1220,5001800US1222,0.07604962558569756 +5001800US1220,5001800US1224,0.07049154352527683 +5001800US1220,5001800US1221,0.0194666628254647 +5001800US1220,5001800US1223,0.016768904372591105 +5001800US1220,5001800US1225,0.01250311240248663 +5001800US1221,5001800US1222,0.887138729405424 +5001800US1221,5001800US1223,0.10680921618971338 +5001800US1221,5001800US1220,0.006052054404862603 
+5001800US1222,5001800US1223,0.86342693421676 +5001800US1222,5001800US1220,0.09376805885037684 +5001800US1222,5001800US1225,0.0410140895909871 +5001800US1222,5001800US1222,0.0017909173418760554 +5001800US1223,5001800US1225,0.7424736452134326 +5001800US1223,5001800US1220,0.1538560090194546 +5001800US1223,5001800US1224,0.10367034576711273 +5001800US1223,5001800US1227,0.0 +5001800US1224,5001800US1224,0.7304388158425426 +5001800US1224,5001800US1226,0.14046409077464977 +5001800US1224,5001800US1225,0.11044786821184337 +5001800US1224,5001800US1227,0.0186492251709643 +5001800US1225,5001800US1226,0.6776329968288068 +5001800US1225,5001800US1227,0.11529185581505136 +5001800US1225,5001800US1228,0.10399597699773193 +5001800US1225,5001800US1218,0.08066530057916181 +5001800US1225,5001800US1219,0.020929190370793552 +5001800US1225,5001800US1224,0.0014846794084544623 +5001800US1226,5001800US1228,0.8915866534230953 +5001800US1226,5001800US1227,0.10841334657690468 +5001800US1227,5001800US1227,0.7776326837521148 +5001800US1227,5001800US1224,0.20028071934331726 +5001800US1227,5001800US1226,0.022086596904567955 +5001800US1227,5001800US1228,0.0 +5001800US1301,5001800US1301,0.9561881617774692 +5001800US1301,5001800US1308,0.0422120002314413 +5001800US1301,5001800US1312,0.00159983799108951 +5001800US1302,5001800US1302,0.9535883881372637 +5001800US1302,5001800US1308,0.0425193208077129 +5001800US1302,5001800US1303,0.00389229105502341 +5001800US1303,5001800US1303,0.9356156702384453 +5001800US1303,5001800US1302,0.03246398019680113 +5001800US1303,5001800US1310,0.02951333091719719 +5001800US1303,5001800US1306,0.0024070186475563973 +5001800US1304,5001800US1304,0.5292481140839385 +5001800US1304,5001800US1313,0.34044250327086667 +5001800US1304,5001800US1305,0.12428806972660905 +5001800US1304,5001800US1310,0.0060213129185858185 +5001800US1305,5001800US1305,0.8409159109359058 +5001800US1305,5001800US1306,0.09164795107729144 +5001800US1305,5001800US1304,0.054169391087904833 +5001800US1305,5001800US1313,0.013266746898897912 +5001800US1306,5001800US1311,0.35373415889203297 +5001800US1306,5001800US1304,0.2627546017426386 +5001800US1306,5001800US1307,0.2618954230054446 +5001800US1306,5001800US1305,0.07814324003772916 +5001800US1306,5001800US1306,0.04347257632215467 +5001800US1307,5001800US1307,0.5697457009305716 +5001800US1307,5001800US1304,0.2169990136798319 +5001800US1307,5001800US1313,0.11428448904326943 +5001800US1307,5001800US1309,0.09897079634632704 +5001800US1308,5001800US1308,0.8445285294339339 +5001800US1308,5001800US1302,0.15547147056606617 +5001800US1309,5001800US1309,0.5130770554046914 +5001800US1309,5001800US1307,0.28956600272784244 +5001800US1309,5001800US1310,0.19105955789292603 +5001800US1309,5001800US1311,0.006297383974540137 +5001800US1310,5001800US1310,0.6874925073082563 +5001800US1310,5001800US1312,0.2771235441123581 +5001800US1310,5001800US1308,0.03511959104504222 +5001800US1310,5001800US1313,0.00026435753434342503 +5001800US1311,5001800US1311,0.5762284303464563 +5001800US1311,5001800US1306,0.34235702976142557 +5001800US1311,5001800US1314,0.08141453989211819 +5001800US1312,5001800US1312,0.9080261599572386 +5001800US1312,5001800US1308,0.0680724848815151 +5001800US1312,5001800US1301,0.02390135516124637 +5001800US1313,5001800US1306,0.4755343460142492 +5001800US1313,5001800US1305,0.21698908699849687 +5001800US1313,5001800US1313,0.1867369758887436 +5001800US1313,5001800US1303,0.1207395910985103 +5001800US1314,5001800US1314,0.9248507449920454 +5001800US1314,5001800US1311,0.055233761568332276 
+5001800US1314,5001800US1303,0.019915493439622286 +5001800US1501,5001800US1501,0.9875087703991304 +5001800US1501,5001800US1502,0.012491229600869588 +5001800US1502,5001800US1502,1.0 +5001800US1502,5001800US1501,0.0 +5001800US1601,5001800US1601,0.9915164906448278 +5001800US1601,5001800US1602,0.008483509355172204 +5001800US1602,5001800US1602,0.9999813547194701 +5001800US1602,5001800US1601,1.8645280529898873e-05 +5001800US1701,5001800US1701,0.7328664399960247 +5001800US1701,5001800US1706,0.16391961524214507 +5001800US1701,5001800US1707,0.05042568505630125 +5001800US1701,5001800US1702,0.0340563102878155 +5001800US1701,5001800US1704,0.014863591053235477 +5001800US1701,5001800US1714,0.0038683583644779597 +5001800US1702,5001800US1702,0.866871509273929 +5001800US1702,5001800US1701,0.13312849072607102 +5001800US1703,5001800US1706,0.42346040790348827 +5001800US1703,5001800US1704,0.38180285318279644 +5001800US1703,5001800US1701,0.08662255730722768 +5001800US1703,5001800US1714,0.05225556482250935 +5001800US1703,5001800US1707,0.03742106541145526 +5001800US1703,5001800US1711,0.018437551372522995 +5001800US1704,5001800US1704,0.5137089622251435 +5001800US1704,5001800US1703,0.4625259850064018 +5001800US1704,5001800US1707,0.022781258287399162 +5001800US1704,5001800US1705,0.00098379448105554 +5001800US1705,5001800US1705,0.5793785972791489 +5001800US1705,5001800US1703,0.20435952079881398 +5001800US1705,5001800US1704,0.09853002092962414 +5001800US1705,5001800US1707,0.05321520449986919 +5001800US1705,5001800US1706,0.037086040376733236 +5001800US1705,5001800US1709,0.017764291008982296 +5001800US1705,5001800US1708,0.00966632510682829 +5001800US1706,5001800US1706,0.2649640408149341 +5001800US1706,5001800US1708,0.17813077219242998 +5001800US1706,5001800US1703,0.16607018985994842 +5001800US1706,5001800US1709,0.14958111245225883 +5001800US1706,5001800US1705,0.14925295544375114 +5001800US1706,5001800US1711,0.07081855679146921 +5001800US1706,5001800US1704,0.02099717492555547 +5001800US1706,5001800US1710,0.00018519751965286838 +5001800US1707,5001800US1707,0.8356214674918557 +5001800US1707,5001800US1704,0.05429995574161812 +5001800US1707,5001800US1705,0.04485768390808768 +5001800US1707,5001800US1703,0.03863103746000421 +5001800US1707,5001800US1701,0.020530084816472824 +5001800US1707,5001800US1702,0.006059770581961444 +5001800US1708,5001800US1708,0.6857132588595797 +5001800US1708,5001800US1703,0.170406761704302 +5001800US1708,5001800US1706,0.09747842433163478 +5001800US1708,5001800US1709,0.025812671788885817 +5001800US1708,5001800US1705,0.013671231035835221 +5001800US1708,5001800US1710,0.006917652279762546 +5001800US1708,5001800US1704,0.0 +5001800US1709,5001800US1709,0.7062608912123475 +5001800US1709,5001800US1705,0.1787306724199928 +5001800US1709,5001800US1708,0.06752966558791801 +5001800US1709,5001800US1710,0.04747877077974166 +5001800US1710,5001800US1710,0.8249960175825701 +5001800US1710,5001800US1709,0.11204077995448215 +5001800US1710,5001800US1705,0.05610713570343687 +5001800US1710,5001800US1708,0.0068560667595108805 +5001800US1711,5001800US1711,0.5593862773123179 +5001800US1711,5001800US1714,0.32027369568135694 +5001800US1711,5001800US1706,0.08971903945924177 +5001800US1711,5001800US1701,0.030620987547083395 +5001800US1712,5001800US1712,0.5961217321481562 +5001800US1712,5001800US1713,0.3996578683491372 +5001800US1712,5001800US1715,0.004220399502706603 +5001800US1713,5001800US1713,0.5947948448680809 +5001800US1713,5001800US1715,0.33531142019670085 +5001800US1713,5001800US1717,0.059265515333313916 
+5001800US1713,5001800US1716,0.010628219601904309 +5001800US1714,5001800US1711,0.44987551317288366 +5001800US1714,5001800US1714,0.21123631670458598 +5001800US1714,5001800US1710,0.20862351333697274 +5001800US1714,5001800US1716,0.04576508120832689 +5001800US1714,5001800US1703,0.04057103006932764 +5001800US1714,5001800US1709,0.022025806166594615 +5001800US1714,5001800US1708,0.021852250387344944 +5001800US1714,5001800US1701,5.0488953963540664e-05 +5001800US1715,5001800US1712,0.5253890530153864 +5001800US1715,5001800US1715,0.3359293734830502 +5001800US1715,5001800US1702,0.1347890821973043 +5001800US1715,5001800US1713,0.003779529041499511 +5001800US1715,5001800US1716,0.0001129622627596367 +5001800US1716,5001800US1716,0.4820501673237491 +5001800US1716,5001800US1714,0.2313222345126926 +5001800US1716,5001800US1717,0.10048715840624348 +5001800US1716,5001800US1702,0.09952826798445441 +5001800US1716,5001800US1701,0.045711200436146134 +5001800US1716,5001800US1711,0.04090097133671428 +5001800US1717,5001800US1717,0.8136161411561997 +5001800US1717,5001800US1716,0.14275098070808273 +5001800US1717,5001800US1715,0.04363287813571751 +5001800US1718,5001800US1715,0.43658111095457497 +5001800US1718,5001800US1716,0.34141233887645295 +5001800US1718,5001800US1717,0.1454151426131452 +5001800US1718,5001800US1713,0.07659140755582688 +5001800US1801,5001800US1801,0.988531203859671 +5001800US1801,5001800US1802,0.011468796140328988 +5001800US1802,5001800US1802,0.9614751108212379 +5001800US1802,5001800US1801,0.030442216351061046 +5001800US1802,5001800US1803,0.008082672827701072 +5001800US1803,5001800US1803,0.9597306037408526 +5001800US1803,5001800US1802,0.040269396259147425 +5001800US1804,5001800US1804,0.8313791776657714 +5001800US1804,5001800US1805,0.12645125968862403 +5001800US1804,5001800US1802,0.023428810986406735 +5001800US1804,5001800US1808,0.018740751659197804 +5001800US1805,5001800US1805,0.6400148982319304 +5001800US1805,5001800US1807,0.3104134259360691 +5001800US1805,5001800US1804,0.03134544179830423 +5001800US1805,5001800US1803,0.018226234033696297 +5001800US1806,5001800US1806,0.5011702888227366 +5001800US1806,5001800US1809,0.32791859893966063 +5001800US1806,5001800US1805,0.1407964956011474 +5001800US1806,5001800US1803,0.030114616636455326 +5001800US1807,5001800US1807,0.8034097347630396 +5001800US1807,5001800US1806,0.19659026523696044 +5001800US1808,5001800US1808,1.0 +5001800US1809,5001800US1809,0.7601494094636573 +5001800US1809,5001800US1806,0.15824668034071387 +5001800US1809,5001800US1808,0.051898563431577525 +5001800US1809,5001800US1804,0.029705346764051274 +5001800US1901,5001800US1902,0.8720655061774997 +5001800US1901,5001800US1901,0.07558665679428175 +5001800US1901,5001800US1904,0.05234783702821857 +5001800US1902,5001800US1901,0.8463125152936964 +5001800US1902,5001800US1903,0.15368748470630356 +5001800US1903,5001800US1903,0.7723673464179066 +5001800US1903,5001800US1904,0.18090719991832777 +5001800US1903,5001800US1901,0.0467254536637656 +5001800US1904,5001800US1904,0.8253842778468936 +5001800US1904,5001800US1902,0.1635109839452142 +5001800US1904,5001800US1903,0.011104738207892137 +5001800US2001,5001800US2001,0.8843389638576956 +5001800US2001,5001800US2002,0.11097291686326319 +5001800US2001,5001800US2004,0.004688119279041238 +5001800US2002,5001800US2002,0.7700740811652688 +5001800US2002,5001800US2001,0.15106556109486954 +5001800US2002,5001800US2003,0.0788603577398617 +5001800US2003,5001800US2003,0.8321204268744993 +5001800US2003,5001800US2002,0.1678795731255007 +5001800US2004,5001800US2004,1.0 
+5001800US2101,5001800US2101,0.9002669684401693 +5001800US2101,5001800US2102,0.09973303155983068 +5001800US2102,5001800US2102,0.8524362905311212 +5001800US2102,5001800US2101,0.06378432398065823 +5001800US2102,5001800US2106,0.06038959650183638 +5001800US2102,5001800US2104,0.02338978898638415 +5001800US2103,5001800US2103,0.9897572842907433 +5001800US2103,5001800US2102,0.010242715709256708 +5001800US2104,5001800US2104,0.9288639599361822 +5001800US2104,5001800US2105,0.05473830922861576 +5001800US2104,5001800US2102,0.012965229702017368 +5001800US2104,5001800US2103,0.0034325011331846384 +5001800US2105,5001800US2105,0.9925275551511651 +5001800US2105,5001800US2104,0.007472444848834865 +5001800US2106,5001800US2106,0.9323193563898508 +5001800US2106,5001800US2105,0.030919473416900174 +5001800US2106,5001800US2101,0.02025572148334682 +5001800US2106,5001800US2104,0.016505448709902303 +5001800US2201,5001800US2201,0.8325783943568084 +5001800US2201,5001800US2202,0.07545747116388095 +5001800US2201,5001800US2203,0.05912838698044444 +5001800US2201,5001800US2205,0.032835747498866225 +5001800US2202,5001800US2202,0.8091613356906828 +5001800US2202,5001800US2206,0.15022270954110004 +5001800US2202,5001800US2201,0.03794810609405958 +5001800US2202,5001800US2205,0.002667848674157528 +5001800US2203,5001800US2203,0.8182390505997069 +5001800US2203,5001800US2204,0.11466251786457297 +5001800US2203,5001800US2206,0.06709843153572011 +5001800US2204,5001800US2204,0.6867822356979522 +5001800US2204,5001800US2206,0.3132177643020479 +5001800US2205,5001800US2205,0.5912365355773487 +5001800US2205,5001800US2204,0.22153779067846815 +5001800US2205,5001800US2206,0.18722567374418314 +5001800US2206,5001800US2206,0.4288051906586389 +5001800US2206,5001800US2205,0.2877383232986681 +5001800US2206,5001800US2203,0.11320356686735997 +5001800US2206,5001800US2202,0.11290993887976647 +5001800US2206,5001800US2201,0.057342980295566504 +5001800US2301,5001800US2301,0.941166511470414 +5001800US2301,5001800US2302,0.05883348852958605 +5001800US2302,5001800US2302,0.9799833209581671 +5001800US2302,5001800US2301,0.02001667904183294 +5001800US2401,5001800US2401,0.8710774342335513 +5001800US2401,5001800US2402,0.12892256576644867 +5001800US2402,5001800US2402,0.4366817998188828 +5001800US2402,5001800US2407,0.26050547631884446 +5001800US2402,5001800US2401,0.14073831560957367 +5001800US2402,5001800US2403,0.13744035971147392 +5001800US2402,5001800US2405,0.0246340485412252 +5001800US2403,5001800US2403,0.3098054988636244 +5001800US2403,5001800US2407,0.2845048950213751 +5001800US2403,5001800US2402,0.22175483091363193 +5001800US2403,5001800US2408,0.11619308900964953 +5001800US2403,5001800US2404,0.03769359660246525 +5001800US2403,5001800US2405,0.030048089589253785 +5001800US2404,5001800US2404,0.7260962321123802 +5001800US2404,5001800US2405,0.14302711041092295 +5001800US2404,5001800US2403,0.1308471182880399 +5001800US2404,5001800US2408,2.9539188656951555e-05 +5001800US2405,5001800US2405,0.6828384316140046 +5001800US2405,5001800US2404,0.31716156838599535 +5001800US2405,5001800US2403,0.0 +5001800US2406,5001800US2406,0.763907930400518 +5001800US2406,5001800US2408,0.23609206959948204 +5001800US2407,5001800US2407,0.6091312221019334 +5001800US2407,5001800US2403,0.22031060524784374 +5001800US2407,5001800US2402,0.14582256423260606 +5001800US2407,5001800US2401,0.024735608417616784 +5001800US2408,5001800US2408,0.6534096708918524 +5001800US2408,5001800US2406,0.19547588882017042 +5001800US2408,5001800US2402,0.12317537423217105 +5001800US2408,5001800US2403,0.027939066055806077 
+5001800US2408,5001800US2404,0.0 +5001800US2501,5001800US2501,0.9788216495205518 +5001800US2501,5001800US2502,0.021178350479448205 +5001800US2502,5001800US2502,0.8937180011792643 +5001800US2502,5001800US2501,0.07845007297906001 +5001800US2502,5001800US2504,0.027831925841675716 +5001800US2503,5001800US2503,0.9435368111458736 +5001800US2503,5001800US2506,0.03571847060615648 +5001800US2503,5001800US2505,0.01977800822582346 +5001800US2503,5001800US2502,0.0009667100221464477 +5001800US2504,5001800US2504,0.9128553617716668 +5001800US2504,5001800US2505,0.03787473233404711 +5001800US2504,5001800US2508,0.03331384537360532 +5001800US2504,5001800US2502,0.015862729629212216 +5001800US2504,5001800US2509,9.333089146849995e-05 +5001800US2505,5001800US2505,0.9431400489978259 +5001800US2505,5001800US2502,0.04449642731279783 +5001800US2505,5001800US2504,0.007218581718941797 +5001800US2505,5001800US2507,0.0051449419704344595 +5001800US2505,5001800US2508,0.0 +5001800US2506,5001800US2506,0.9606618898017568 +5001800US2506,5001800US2503,0.03818849125728009 +5001800US2506,5001800US2505,0.0011496189409631033 +5001800US2507,5001800US2507,0.9583135106287732 +5001800US2507,5001800US2508,0.03649893879041418 +5001800US2507,5001800US2505,0.005187550580812559 +5001800US2508,5001800US2508,0.944787405218379 +5001800US2508,5001800US2509,0.04152162405656574 +5001800US2508,5001800US2507,0.00965431657324345 +5001800US2508,5001800US2504,0.004036654151811849 +5001800US2508,5001800US2505,0.0 +5001800US2509,5001800US2509,0.9300874102769248 +5001800US2509,5001800US2504,0.06991258972307526 +5001800US2509,5001800US2508,0.0 +5001800US2601,5001800US2601,0.9404095360933812 +5001800US2601,5001800US2602,0.05959046390661885 +5001800US2602,5001800US2603,0.486529033934378 +5001800US2602,5001800US2602,0.3071689764664076 +5001800US2602,5001800US2604,0.2063019895992144 +5001800US2603,5001800US2603,0.5487075481609025 +5001800US2603,5001800US2602,0.23468724570740748 +5001800US2603,5001800US2604,0.13520842866987057 +5001800US2603,5001800US2605,0.08139677746181939 +5001800US2604,5001800US2602,0.48116656595311563 +5001800US2604,5001800US2607,0.23333636365256855 +5001800US2604,5001800US2608,0.19860177505405555 +5001800US2604,5001800US2601,0.08689529534026029 +5001800US2605,5001800US2608,0.9182020676890162 +5001800US2605,5001800US2601,0.05437712451879429 +5001800US2605,5001800US2609,0.027063642313728525 +5001800US2605,5001800US2607,0.00035716547846098627 +5001800US2606,5001800US2604,0.6152097283555882 +5001800US2606,5001800US2605,0.3847902716444118 +5001800US2607,5001800US2605,0.709607595652092 +5001800US2607,5001800US2607,0.17329276792946893 +5001800US2607,5001800US2606,0.10808356724375036 +5001800US2607,5001800US2602,0.009016069174688696 +5001800US2608,5001800US2607,0.6547975192781864 +5001800US2608,5001800US2609,0.23932954941294593 +5001800US2608,5001800US2610,0.10573974277537583 +5001800US2608,5001800US2611,0.00013318853349179353 +5001800US2609,5001800US2610,0.7095396396101716 +5001800US2609,5001800US2611,0.26760158451704424 +5001800US2609,5001800US2612,0.022858775872784145 +5001800US2610,5001800US2609,0.7811212928925769 +5001800US2610,5001800US2610,0.21887870710742313 +5001800US2611,5001800US2611,0.5101384973032931 +5001800US2611,5001800US2606,0.27019631562775975 +5001800US2611,5001800US2612,0.1290129337582104 +5001800US2611,5001800US2607,0.04340239196263452 +5001800US2611,5001800US2609,0.03674516844031591 +5001800US2611,5001800US2610,0.010504692907786342 +5001800US2612,5001800US2606,0.5760270363496696 
+5001800US2612,5001800US2613,0.27983553028363195 +5001800US2612,5001800US2612,0.1441374333666985 +5001800US2613,5001800US2612,0.6246669596506167 +5001800US2613,5001800US2613,0.37533304034938336 +5001800US2614,5001800US2613,0.41190846300540057 +5001800US2614,5001800US2611,0.36712753000753195 +5001800US2614,5001800US2612,0.2209640069870675 +5001800US2701,5001800US2701,0.9416988690444315 +5001800US2701,5001800US2702,0.04513284884373742 +5001800US2701,5001800US2707,0.01316828211183107 +5001800US2702,5001800US2702,0.8406623603360881 +5001800US2702,5001800US2701,0.15933763966391196 +5001800US2703,5001800US2703,0.9515062148633415 +5001800US2703,5001800US2706,0.04849378513665852 +5001800US2703,5001800US2705,0.0 +5001800US2704,5001800US2704,0.9872695337649051 +5001800US2704,5001800US2702,0.010586497371606685 +5001800US2704,5001800US2708,0.0021439688634881477 +5001800US2705,5001800US2705,0.9848856987648165 +5001800US2705,5001800US2703,0.015114301235183586 +5001800US2706,5001800US2706,0.8292238057785354 +5001800US2706,5001800US2708,0.07227300687658564 +5001800US2706,5001800US2707,0.052240016403426355 +5001800US2706,5001800US2703,0.046263170941452623 +5001800US2707,5001800US2707,0.9425849699683243 +5001800US2707,5001800US2708,0.05734855249659078 +5001800US2707,5001800US2706,6.647753508486665e-05 +5001800US2708,5001800US2708,0.870002069399742 +5001800US2708,5001800US2707,0.12999793060025802 +5001800US2801,5001800US2801,0.9572797698899 +5001800US2801,5001800US2803,0.04272023011009995 +5001800US2802,5001800US2802,0.9960763018430357 +5001800US2802,5001800US2803,0.003923698156964258 +5001800US2803,5001800US2803,0.8655147881482269 +5001800US2803,5001800US2802,0.13448521185177312 +5001800US2804,5001800US2804,0.8733773563607631 +5001800US2804,5001800US2803,0.12662264363923692 +5001800US2901,5001800US2901,0.9769804108589307 +5001800US2901,5001800US2902,0.023019589141069274 +5001800US2902,5001800US2902,0.6755620222021533 +5001800US2902,5001800US2903,0.1577787440583039 +5001800US2902,5001800US2901,0.10836278630706166 +5001800US2902,5001800US2908,0.05829644743248115 +5001800US2903,5001800US2903,0.6891206851164442 +5001800US2903,5001800US2902,0.15215873989581244 +5001800US2903,5001800US2908,0.13246115910457637 +5001800US2903,5001800US2904,0.0161691151786814 +5001800US2903,5001800US2906,0.01009030070448557 +5001800US2904,5001800US2904,0.7471718789325862 +5001800US2904,5001800US2903,0.14004405610136914 +5001800US2904,5001800US2906,0.09228840041606065 +5001800US2904,5001800US2907,0.020495664549984076 +5001800US2905,5001800US2905,0.8467785096552989 +5001800US2905,5001800US2904,0.12422438948743988 +5001800US2905,5001800US2906,0.02899710085726119 +5001800US2906,5001800US2906,0.8902455639134994 +5001800US2906,5001800US2905,0.07760777481442337 +5001800US2906,5001800US2904,0.03214666127207722 +5001800US2907,5001800US2907,1.0 +5001800US2908,5001800US2908,0.9008372927864159 +5001800US2908,5001800US2903,0.09916270721358411 +5001800US3000,5001800US3002,0.6106969801944205 +5001800US3000,5001800US3001,0.3893030198055794 +5001800US3101,5001800US3101,0.8532079757052351 +5001800US3101,5001800US3103,0.10288864440171386 +5001800US3101,5001800US3102,0.04390337989305108 +5001800US3102,5001800US3102,0.9112527509181813 +5001800US3102,5001800US3101,0.08874724908181875 +5001800US3103,5001800US3103,1.0 +5001800US3201,5001800US3201,0.5651471999748242 +5001800US3201,5001800US3203,0.24891507875316665 +5001800US3201,5001800US3204,0.1859377212720092 +5001800US3202,5001800US3202,0.9999979321753515 
+5001800US3202,5001800US3204,2.0678246484698098e-06 +5001800US3203,5001800US3203,0.596842989599895 +5001800US3203,5001800US3201,0.39709212860480453 +5001800US3203,5001800US3204,0.006064881795300441 +5001800US3204,5001800US3204,0.7958110422121486 +5001800US3204,5001800US3203,0.13791955636236355 +5001800US3204,5001800US3201,0.05101764187104722 +5001800US3204,5001800US3202,0.015251759554440665 +5001800US3301,5001800US3301,0.9951414538029315 +5001800US3301,5001800US3302,0.00485854619706852 +5001800US3302,5001800US3302,1.0 +5001800US3401,5001800US3401,0.9874253792650199 +5001800US3401,5001800US3402,0.012574620734980009 +5001800US3402,5001800US3402,0.9562124416629942 +5001800US3402,5001800US3401,0.04128904730206906 +5001800US3402,5001800US3403,0.002498511034936744 +5001800US3403,5001800US3403,0.5688511147176902 +5001800US3403,5001800US3404,0.3768787904672418 +5001800US3403,5001800US3402,0.05427009481506791 +5001800US3404,5001800US3404,0.5840949485209217 +5001800US3404,5001800US3403,0.3162222700610063 +5001800US3404,5001800US3406,0.09968278141807203 +5001800US3405,5001800US3405,0.8014430283325896 +5001800US3405,5001800US3407,0.1225707742628345 +5001800US3405,5001800US3409,0.0759861974045759 +5001800US3406,5001800US3406,0.9214673469750344 +5001800US3406,5001800US3404,0.038562113672227685 +5001800US3406,5001800US3403,0.03432553046124326 +5001800US3406,5001800US3412,0.005645008891494691 +5001800US3407,5001800US3407,0.7117057460687819 +5001800US3407,5001800US3412,0.12228593963095023 +5001800US3407,5001800US3410,0.10259780804811121 +5001800US3407,5001800US3411,0.06341050625215662 +5001800US3408,5001800US3408,0.8651699383579543 +5001800US3408,5001800US3411,0.05875516886231227 +5001800US3408,5001800US3409,0.03899128402994481 +5001800US3408,5001800US3410,0.03708360874978863 +5001800US3409,5001800US3409,0.8068333136385843 +5001800US3409,5001800US3405,0.1931666863614157 +5001800US3410,5001800US3410,0.7555074254785711 +5001800US3410,5001800US3411,0.13674598648723596 +5001800US3410,5001800US3407,0.05987169842445893 +5001800US3410,5001800US3408,0.04787488960973397 +5001800US3411,5001800US3411,0.7817920628097502 +5001800US3411,5001800US3407,0.08850637200364114 +5001800US3411,5001800US3410,0.08514337008192577 +5001800US3411,5001800US3409,0.02606800596743198 +5001800US3411,5001800US3405,0.018490189137250937 +5001800US3412,5001800US3412,0.8600999399935214 +5001800US3412,5001800US3403,0.07852883231640938 +5001800US3412,5001800US3407,0.04225411947130575 +5001800US3412,5001800US3406,0.019117108218763442 +5001800US3501,5001800US3501,0.7854562273666081 +5001800US3501,5001800US3502,0.2145437726333918 +5001800US3502,5001800US3502,0.653947440544708 +5001800US3502,5001800US3503,0.28274241481788653 +5001800US3502,5001800US3501,0.06331014463740553 +5001800US3503,5001800US3503,0.8652971597638904 +5001800US3503,5001800US3501,0.13295054080006238 +5001800US3503,5001800US3502,0.0017522994360471634 +5001800US3601,5001800US3601,0.7777308735322014 +5001800US3601,5001800US3602,0.2222691264677986 +5001800US3602,5001800US3602,0.8105130028090965 +5001800US3602,5001800US3603,0.13333430807948812 +5001800US3602,5001800US3604,0.05615268911141531 +5001800US3603,5001800US3603,0.7886431433183732 +5001800US3603,5001800US3601,0.21122219916322235 +5001800US3603,5001800US3614,0.00012990490010788444 +5001800US3603,5001800US3606,4.752618296629918e-06 +5001800US3604,5001800US3604,0.883769170512824 +5001800US3604,5001800US3603,0.11623082948717599 +5001800US3605,5001800US3605,0.8415889123726904 +5001800US3605,5001800US3604,0.1295387765021246 
+5001800US3605,5001800US3603,0.028801614245421273 +5001800US3605,5001800US3606,7.069687976375459e-05 +5001800US3606,5001800US3606,0.7995057816525636 +5001800US3606,5001800US3607,0.08138881766120859 +5001800US3606,5001800US3605,0.06057608947824707 +5001800US3606,5001800US3603,0.05852931120798076 +5001800US3607,5001800US3607,0.551085215662451 +5001800US3607,5001800US3610,0.42588501684431435 +5001800US3607,5001800US3605,0.021521756472328075 +5001800US3607,5001800US3611,0.0008790338911286611 +5001800US3607,5001800US3608,0.000628977129777925 +5001800US3608,5001800US3608,0.7606744331212871 +5001800US3608,5001800US3607,0.12366337258642578 +5001800US3608,5001800US3609,0.050926153220319965 +5001800US3608,5001800US3605,0.0467109462122753 +5001800US3608,5001800US3610,0.01802509485969182 +5001800US3608,5001800US3611,0.0 +5001800US3609,5001800US3609,0.7456206621537796 +5001800US3609,5001800US3608,0.1622446481435032 +5001800US3609,5001800US3610,0.0921346897027172 +5001800US3610,5001800US3612,0.33220618493145404 +5001800US3610,5001800US3610,0.276788397666587 +5001800US3610,5001800US3609,0.20647622276850047 +5001800US3610,5001800US3611,0.1516949609685051 +5001800US3610,5001800US3613,0.03283423366495334 +5001800US3610,5001800US3608,0.0 +5001800US3611,5001800US3611,0.8708431322850461 +5001800US3611,5001800US3608,0.08634092972667742 +5001800US3611,5001800US3609,0.04281593798827655 +5001800US3612,5001800US3612,0.5659346745382134 +5001800US3612,5001800US3607,0.16946860446469458 +5001800US3612,5001800US3610,0.1482147669203262 +5001800US3612,5001800US3614,0.11638195407676577 +5001800US3612,5001800US3613,0.0 +5001800US3613,5001800US3613,0.9412145148753163 +5001800US3613,5001800US3615,0.058413339615789775 +5001800US3613,5001800US3612,0.00037214550889397757 +5001800US3614,5001800US3614,0.5794431989565075 +5001800US3614,5001800US3606,0.20730444707129173 +5001800US3614,5001800US3615,0.12144132393103167 +5001800US3614,5001800US3607,0.09181103004116904 +5001800US3614,5001800US3603,0.0 +5001800US3615,5001800US3615,0.5799387820555455 +5001800US3615,5001800US3614,0.338683622056787 +5001800US3615,5001800US3613,0.08137759588766759 +5001800US3616,5001800US3616,0.7157904756373399 +5001800US3616,5001800US3615,0.28231387998877133 +5001800US3616,5001800US3613,0.0018956443738888298 +5001800US3617,5001800US3617,0.7087298045166474 +5001800US3617,5001800US3616,0.2912701954833527 +5001800US3618,5001800US3618,0.74410854131465 +5001800US3618,5001800US3617,0.25589145868535 +5001800US3619,5001800US3619,0.46916598700882584 +5001800US3619,5001800US3618,0.38997189129358306 +5001800US3619,5001800US3621,0.07942348023652093 +5001800US3619,5001800US3620,0.03867677900439836 +5001800US3619,5001800US3617,0.022761862456671812 +5001800US3620,5001800US3620,0.966826774691358 +5001800US3620,5001800US3619,0.018674768518518518 +5001800US3620,5001800US3621,0.014498456790123456 +5001800US3621,5001800US3621,0.8338834368606118 +5001800US3621,5001800US3624,0.11727619117629041 +5001800US3621,5001800US3620,0.048840371963097734 +5001800US3622,5001800US3619,0.408574246313831 +5001800US3622,5001800US3622,0.31161061676333013 +5001800US3622,5001800US3621,0.23706694854743782 +5001800US3622,5001800US3624,0.024235780587885464 +5001800US3622,5001800US3623,0.01851240778751559 +5001800US3623,5001800US3623,0.7260454178274032 +5001800US3623,5001800US3619,0.1516280509440657 +5001800US3623,5001800US3624,0.1223265312285311 +5001800US3624,5001800US3622,0.7947957839262187 +5001800US3624,5001800US3624,0.20520421607378128 +5001800US3625,5001800US3625,1.0 
+5001800US3626,5001800US3626,1.0 +5001800US3626,5001800US3623,0.0 +5001800US3627,5001800US3624,0.4975097244887349 +5001800US3627,5001800US3623,0.4278562834096933 +5001800US3627,5001800US3625,0.04418620936988708 +5001800US3627,5001800US3626,0.03044778273168475 +5001800US3701,5001800US3701,0.521492970029817 +5001800US3701,5001800US3704,0.34909429470483555 +5001800US3701,5001800US3713,0.06587311876112244 +5001800US3701,5001800US3703,0.06353961650422506 +5001800US3702,5001800US3713,0.5915759434296931 +5001800US3702,5001800US3702,0.16855733629744082 +5001800US3702,5001800US3701,0.13473024782356202 +5001800US3702,5001800US3704,0.10513647244930403 +5001800US3703,5001800US3703,0.7100253896113005 +5001800US3703,5001800US3701,0.28997461038869954 +5001800US3704,5001800US3702,0.7875290965786815 +5001800US3704,5001800US3704,0.21247090342131847 +5001800US3704,5001800US3713,0.0 +5001800US3705,5001800US3710,0.4431789962096458 +5001800US3705,5001800US3705,0.3711018167559796 +5001800US3705,5001800US3706,0.15662821853352504 +5001800US3705,5001800US3711,0.02909096850084956 +5001800US3706,5001800US3709,0.49629613747708534 +5001800US3706,5001800US3705,0.31303087802803714 +5001800US3706,5001800US3713,0.17451213716262332 +5001800US3706,5001800US3704,0.012183617242525273 +5001800US3706,5001800US3706,0.003977230089728884 +5001800US3707,5001800US3707,0.7133882206152413 +5001800US3707,5001800US3701,0.22876632486593035 +5001800US3707,5001800US3703,0.03617171459784961 +5001800US3707,5001800US3713,0.021673739920978714 +5001800US3708,5001800US3706,0.3366908737138983 +5001800US3708,5001800US3709,0.3030188382582718 +5001800US3708,5001800US3708,0.21712640402511768 +5001800US3708,5001800US3707,0.14316388400271224 +5001800US3709,5001800US3708,0.7445891182383714 +5001800US3709,5001800US3707,0.13599570539877687 +5001800US3709,5001800US3712,0.09601381009020768 +5001800US3709,5001800US3714,0.022401397850594192 +5001800US3709,5001800US3709,0.00099996842204983 +5001800US3710,5001800US3714,0.44833967154996746 +5001800US3710,5001800US3710,0.38225843236217394 +5001800US3710,5001800US3711,0.16940189608785863 +5001800US3711,5001800US3711,0.7531361162748712 +5001800US3711,5001800US3705,0.1277896704077199 +5001800US3711,5001800US3714,0.11907421331740889 +5001800US3712,5001800US3712,0.873563091049287 +5001800US3712,5001800US3714,0.12120554112861055 +5001800US3712,5001800US3708,0.0052313678221024736 +5001800US3713,5001800US3706,0.515749821393778 +5001800US3713,5001800US3710,0.26001402204558666 +5001800US3713,5001800US3705,0.1766078306962895 +5001800US3713,5001800US3709,0.047628325864345865 +5001800US3800,5001800US3800,1.0 +5001800US3901,5001800US3901,0.6711940673083381 +5001800US3901,5001800US3908,0.3288059326916619 +5001800US3902,5001800US3902,0.524744787922358 +5001800US3902,5001800US3901,0.36714234363767073 +5001800US3902,5001800US3908,0.10811286843997124 +5001800US3903,5001800US3903,0.7221931735657225 +5001800US3903,5001800US3915,0.2778068264342774 +5001800US3904,5001800US3904,0.5038674881480202 +5001800US3904,5001800US3905,0.3343569861103114 +5001800US3904,5001800US3909,0.11068405596821558 +5001800US3904,5001800US3915,0.051091469773452844 +5001800US3905,5001800US3905,0.5013060509523489 +5001800US3905,5001800US3909,0.45066168756797337 +5001800US3905,5001800US3904,0.04803226147967777 +5001800US3906,5001800US3906,0.6051412365923513 +5001800US3906,5001800US3902,0.2667525411355085 +5001800US3906,5001800US3912,0.12810622227214016 +5001800US3907,5001800US3912,0.2310239411946553 +5001800US3907,5001800US3906,0.2139061587606911 
+5001800US3907,5001800US3913,0.20307251056229175 +5001800US3907,5001800US3904,0.16239652388967127 +5001800US3907,5001800US3905,0.10656751279497131 +5001800US3907,5001800US3907,0.08303335279771923 +5001800US3908,5001800US3908,0.6526831903336879 +5001800US3908,5001800US3915,0.1947346967181333 +5001800US3908,5001800US3910,0.1255066296822856 +5001800US3908,5001800US3905,0.027075483265893176 +5001800US3909,5001800US3909,0.3935681798678452 +5001800US3909,5001800US3911,0.2864348236505621 +5001800US3909,5001800US3907,0.19962456019909036 +5001800US3909,5001800US3905,0.12037243628250235 +5001800US3910,5001800US3910,0.9608606430757323 +5001800US3910,5001800US3915,0.039139356924267704 +5001800US3911,5001800US3911,0.773182304717457 +5001800US3911,5001800US3913,0.17935194944204988 +5001800US3911,5001800US3907,0.04746574584049311 +5001800US3912,5001800US3904,0.4063692930988564 +5001800US3912,5001800US3912,0.29986602855367334 +5001800US3912,5001800US3903,0.24525248775291145 +5001800US3912,5001800US3915,0.04851219059455882 +5001800US3913,5001800US3914,0.3893675468313271 +5001800US3913,5001800US3913,0.3339660267653348 +5001800US3913,5001800US3906,0.27666642640333805 +5001800US3914,5001800US3914,0.6613176044166261 +5001800US3914,5001800US3913,0.20643028543620895 +5001800US3914,5001800US3907,0.06744416290503918 +5001800US3914,5001800US3911,0.06480794724212575 +5001800US3915,5001800US3912,0.3399033912242208 +5001800US3915,5001800US3902,0.2715417322639428 +5001800US3915,5001800US3915,0.2244335907639035 +5001800US3915,5001800US3903,0.16412128574793292 +5001800US3916,5001800US3907,0.7033591519716375 +5001800US3916,5001800US3913,0.2373600205030363 +5001800US3916,5001800US3914,0.03914620515850698 +5001800US3916,5001800US3906,0.020134622366819253 +5001800US4001,5001800US4001,0.8842488103200041 +5001800US4001,5001800US4002,0.11561434516042611 +5001800US4001,5001800US4003,0.00013684451956973 +5001800US4002,5001800US4002,0.9971427986045532 +5001800US4002,5001800US4001,0.002857201395446731 +5001800US4003,5001800US4003,0.8040516338653957 +5001800US4003,5001800US4005,0.15793542830858967 +5001800US4003,5001800US4001,0.03801293782601462 +5001800US4004,5001800US4004,0.9535074377312492 +5001800US4004,5001800US4003,0.03701929818883963 +5001800US4004,5001800US4005,0.009473264079911115 +5001800US4005,5001800US4005,0.6984205169252319 +5001800US4005,5001800US4003,0.25506467818681733 +5001800US4005,5001800US4004,0.04651480488795079 +5001800US4101,5001800US4101,0.6319223670158587 +5001800US4101,5001800US4106,0.36793770924258046 +5001800US4101,5001800US4105,0.00013992374156084934 +5001800US4101,5001800US4103,0.0 +5001800US4102,5001800US4102,0.9444103440623628 +5001800US4102,5001800US4105,0.032057244573711126 +5001800US4102,5001800US4103,0.023532411363926097 +5001800US4103,5001800US4103,0.820860213343091 +5001800US4103,5001800US4105,0.10341333036360599 +5001800US4103,5001800US4101,0.075726456293303 +5001800US4104,5001800US4104,0.7968913676393032 +5001800US4104,5001800US4105,0.12964253978803794 +5001800US4104,5001800US4102,0.07346609257265878 +5001800US4105,5001800US4106,0.47915371509797267 +5001800US4105,5001800US4105,0.41577139823175385 +5001800US4105,5001800US4104,0.07165227852397482 +5001800US4105,5001800US4101,0.025655027336911048 +5001800US4105,5001800US4103,0.007767580809387633 +5001800US4105,5001800US4102,0.0 +5001800US4201,5001800US4201,0.9704663963944842 +5001800US4201,5001800US4204,0.029533603605515817 +5001800US4202,5001800US4202,0.9980442399551832 +5001800US4202,5001800US4203,0.001955760044816739 
+5001800US4203,5001800US4203,0.9651450324415789 +5001800US4203,5001800US4202,0.03485496755842115 +5001800US4204,5001800US4204,0.8132087976394857 +5001800US4204,5001800US4205,0.11333182922229233 +5001800US4204,5001800US4201,0.07345937313822193 +5001800US4205,5001800US4205,0.9352211556887271 +5001800US4205,5001800US4203,0.06179816385903176 +5001800US4205,5001800US4204,0.002980680452241172 +5001800US4206,5001800US4206,0.9999884570322645 +5001800US4206,5001800US4205,1.1542967735481352e-05 +5001800US4207,5001800US4207,0.9560041017339772 +5001800US4207,5001800US4208,0.04399589826602283 +5001800US4208,5001800US4208,1.0 +5001800US4209,5001800US4209,0.8270239211179662 +5001800US4209,5001800US4204,0.09447253130846436 +5001800US4209,5001800US4207,0.05684158093527296 +5001800US4209,5001800US4208,0.014906084412248052 +5001800US4209,5001800US4206,0.006755882226048486 +5001800US4210,5001800US4210,0.9981201533812293 +5001800US4210,5001800US4211,0.0018798466187706694 +5001800US4211,5001800US4211,0.9940133511133313 +5001800US4211,5001800US4210,0.005986648886668627 +5001800US4212,5001800US4209,0.41404264144225494 +5001800US4212,5001800US4215,0.4093864763823965 +5001800US4212,5001800US4213,0.17657088217534858 +5001800US4213,5001800US4213,0.803400098291927 +5001800US4213,5001800US4214,0.16630191669257696 +5001800US4213,5001800US4210,0.030297985015496024 +5001800US4214,5001800US4214,0.8417178002193104 +5001800US4214,5001800US4212,0.15828219978068955 +5001800US4215,5001800US4215,0.6702230151650312 +5001800US4215,5001800US4213,0.146158616881186 +5001800US4215,5001800US4214,0.1102858842020305 +5001800US4215,5001800US4216,0.07333248375175226 +5001800US4216,5001800US4216,1.0 +5001800US4217,5001800US4217,0.9754036675840521 +5001800US4217,5001800US4216,0.02459633241594798 +5001800US4218,5001800US4212,0.9361369308404336 +5001800US4218,5001800US4217,0.06386306915956647 +5001800US4401,5001800US4401,0.9892810219452863 +5001800US4401,5001800US4402,0.010718978054713618 +5001800US4402,5001800US4402,0.9957674643078059 +5001800US4402,5001800US4401,0.004232535692194053 +5001800US4501,5001800US4501,0.8144612879153154 +5001800US4501,5001800US4506,0.18553871208468461 +5001800US4502,5001800US4502,0.9552158458685349 +5001800US4502,5001800US4506,0.044784154131465075 +5001800US4503,5001800US4503,0.9695316285950909 +5001800US4503,5001800US4504,0.030468371404909093 +5001800US4504,5001800US4504,0.9457216266573607 +5001800US4504,5001800US4505,0.04442757407486771 +5001800US4504,5001800US4503,0.00985079926777154 +5001800US4505,5001800US4505,0.9358826486854168 +5001800US4505,5001800US4503,0.04183548733721744 +5001800US4505,5001800US4506,0.022281863977365708 +5001800US4506,5001800US4506,0.8276732845698362 +5001800US4506,5001800US4501,0.12874787012718047 +5001800US4506,5001800US4502,0.03427800841593945 +5001800US4506,5001800US4507,0.008498310222448154 +5001800US4506,5001800US4505,0.0008025266645956301 +5001800US4507,5001800US4507,0.9991042095720993 +5001800US4507,5001800US4506,0.0008957904279006572 +5001800US4600,5001800US4600,1.0 +5001800US4701,5001800US4701,1.0 +5001800US4702,5001800US4702,0.9579557293782648 +5001800US4702,5001800US4701,0.04204427062173518 +5001800US4703,5001800US4703,0.9350354507888885 +5001800US4703,5001800US4702,0.04586915262987514 +5001800US4703,5001800US4706,0.019095396581236365 +5001800US4704,5001800US4704,0.7687311547065842 +5001800US4704,5001800US4705,0.14059506960476245 +5001800US4704,5001800US4703,0.07852941701917332 +5001800US4704,5001800US4706,0.012144358669480102 +5001800US4705,5001800US4705,0.3956991032305144 
+5001800US4705,5001800US4707,0.34082255169480097 +5001800US4705,5001800US4706,0.2634783450746847 +5001800US4706,5001800US4706,0.7452178436822422 +5001800US4706,5001800US4707,0.11167944818754638 +5001800US4706,5001800US4705,0.11013147746620294 +5001800US4706,5001800US4704,0.0329712306640085 +5001800US4707,5001800US4707,0.4736478313461206 +5001800US4707,5001800US4705,0.22107563606054137 +5001800US4707,5001800US4708,0.17320363085617865 +5001800US4707,5001800US4704,0.13207290173715935 +5001800US4708,5001800US4708,0.9002346736875378 +5001800US4708,5001800US4709,0.09958526977470926 +5001800US4708,5001800US4707,0.0001800565377528544 +5001800US4709,5001800US4709,0.9920755629102133 +5001800US4709,5001800US4708,0.007924437089786735 +5001800US4801,5001800US4801,0.7893897761480967 +5001800US4801,5001800US4817,0.18520767492239737 +5001800US4801,5001800US4805,0.025402548929505876 +5001800US4802,5001800US4802,0.359408487885635 +5001800US4802,5001800US4838,0.35927425669478064 +5001800US4802,5001800US4818,0.1757354750665563 +5001800US4802,5001800US4807,0.07756325644869012 +5001800US4802,5001800US4808,0.028018523904337903 +5001800US4803,5001800US4803,0.5176007187488897 +5001800US4803,5001800US4804,0.44218792034881654 +5001800US4803,5001800US4832,0.040211360902293804 +5001800US4804,5001800US4804,0.5595153906279079 +5001800US4804,5001800US4801,0.2735209550749991 +5001800US4804,5001800US4803,0.1612014739271225 +5001800US4804,5001800US4805,0.005762180369970596 +5001800US4805,5001800US4805,0.4807568300439261 +5001800US4805,5001800US4832,0.2446883559945106 +5001800US4805,5001800US4806,0.2341394723353597 +5001800US4805,5001800US4824,0.02817444789215227 +5001800US4805,5001800US4830,0.01224089373405129 +5001800US4806,5001800US4825,0.48740579826057345 +5001800US4806,5001800US4806,0.40433282851606944 +5001800US4806,5001800US4833,0.07654560512282023 +5001800US4806,5001800US4830,0.019458254536260412 +5001800US4806,5001800US4812,0.01225751356427648 +5001800US4807,5001800US4838,0.47138783269961976 +5001800US4807,5001800US4807,0.3573855182674822 +5001800US4807,5001800US4809,0.11178293932881468 +5001800US4807,5001800US4808,0.03407174739626385 +5001800US4807,5001800US4822,0.025371962307819473 +5001800US4808,5001800US4802,0.3594753042668233 +5001800US4808,5001800US4817,0.3366527611834511 +5001800US4808,5001800US4808,0.14210931329863713 +5001800US4808,5001800US4810,0.1356544014904518 +5001800US4808,5001800US4838,0.02610821976063668 +5001800US4809,5001800US4809,0.7790925827668356 +5001800US4809,5001800US4807,0.19671033655130174 +5001800US4809,5001800US4822,0.024197080681862694 +5001800US4810,5001800US4810,0.37407784444107517 +5001800US4810,5001800US4838,0.23814680846455413 +5001800US4810,5001800US4837,0.19529121271849076 +5001800US4810,5001800US4822,0.10530442504710953 +5001800US4810,5001800US4808,0.0803092984470099 +5001800US4810,5001800US4802,0.003942039030518313 +5001800US4810,5001800US4827,0.0029283718512421756 +5001800US4810,5001800US4835,0.0 +5001800US4811,5001800US4811,0.7644499701265127 +5001800US4811,5001800US4825,0.1508330739141616 +5001800US4811,5001800US4819,0.08471695595932566 +5001800US4812,5001800US4812,0.8362937978608432 +5001800US4812,5001800US4824,0.11262287629205561 +5001800US4812,5001800US4833,0.022888814480278543 +5001800US4812,5001800US4825,0.0165105446982504 +5001800US4812,5001800US4826,0.011683966668572254 +5001800US4813,5001800US4813,0.8933847583643123 +5001800US4813,5001800US4826,0.07749256505576207 +5001800US4813,5001800US4819,0.016944237918215612 +5001800US4813,5001800US4825,0.012178438661710037 
+5001800US4814,5001800US4814,0.9115498013075247 +5001800US4814,5001800US4836,0.08567210934495577 +5001800US4814,5001800US4822,0.002778089347519549 +5001800US4815,5001800US4815,0.5363008110572539 +5001800US4815,5001800US4834,0.2530354657580266 +5001800US4815,5001800US4828,0.2106637231847195 +5001800US4816,5001800US4816,0.9752759215156578 +5001800US4816,5001800US4823,0.024724078484342233 +5001800US4817,5001800US4817,0.7427373353057313 +5001800US4817,5001800US4810,0.23411768768958907 +5001800US4817,5001800US4806,0.014758448405868842 +5001800US4817,5001800US4837,0.008386528598810805 +5001800US4818,5001800US4818,0.7611156199003413 +5001800US4818,5001800US4829,0.14345803771674914 +5001800US4818,5001800US4809,0.05516140010995628 +5001800US4818,5001800US4802,0.035727064080076094 +5001800US4818,5001800US4807,0.004537878192877276 +5001800US4819,5001800US4819,0.9708470161442841 +5001800US4819,5001800US4825,0.029152983855715832 +5001800US4820,5001800US4820,0.8164786861998101 +5001800US4820,5001800US4823,0.12017930752683004 +5001800US4820,5001800US4835,0.030393346873535315 +5001800US4820,5001800US4828,0.016710932711605717 +5001800US4820,5001800US4821,0.016237726688218823 +5001800US4821,5001800US4821,0.7765707422880009 +5001800US4821,5001800US4837,0.10765738483121347 +5001800US4821,5001800US4835,0.06671481004074069 +5001800US4821,5001800US4820,0.04238946436613812 +5001800US4821,5001800US4823,0.005448484001387357 +5001800US4821,5001800US4810,0.0012191144725194514 +5001800US4822,5001800US4822,0.6477084119200301 +5001800US4822,5001800US4836,0.12348642021878536 +5001800US4822,5001800US4809,0.09349302150132026 +5001800US4822,5001800US4807,0.08792436816295737 +5001800US4822,5001800US4829,0.03448698604300264 +5001800US4822,5001800US4814,0.012900792153904187 +5001800US4823,5001800US4823,0.7146581769317389 +5001800US4823,5001800US4820,0.1716762170236079 +5001800US4823,5001800US4828,0.05921161818321338 +5001800US4823,5001800US4816,0.04323269023829197 +5001800US4823,5001800US4821,0.011221297623147853 +5001800US4824,5001800US4824,0.5527587741059345 +5001800US4824,5001800US4832,0.1824468964810402 +5001800US4824,5001800US4833,0.12638159994223194 +5001800US4824,5001800US4826,0.0776724131906835 +5001800US4824,5001800US4806,0.060740316280109855 +5001800US4825,5001800US4831,0.3023374758849657 +5001800US4825,5001800US4825,0.2401274171115797 +5001800US4825,5001800US4837,0.19086245831277013 +5001800US4825,5001800US4806,0.09325377241389624 +5001800US4825,5001800US4835,0.09159376074894941 +5001800US4825,5001800US4811,0.04402769677120254 +5001800US4825,5001800US4821,0.032500336488851006 +5001800US4825,5001800US4810,0.005297082267785305 +5001800US4826,5001800US4826,0.6296421569031847 +5001800US4826,5001800US4824,0.1817924328545776 +5001800US4826,5001800US4813,0.14163029938044994 +5001800US4826,5001800US4804,0.04693511086178781 +5001800US4827,5001800US4827,0.8794279948976444 +5001800US4827,5001800US4822,0.11994515564771459 +5001800US4827,5001800US4810,0.0006268494546409745 +5001800US4828,5001800US4828,0.6541677607453619 +5001800US4828,5001800US4835,0.2067333375587407 +5001800US4828,5001800US4815,0.1198016171237652 +5001800US4828,5001800US4821,0.019297284572132232 +5001800US4829,5001800US4829,0.8676507112362517 +5001800US4829,5001800US4809,0.05858777578215073 +5001800US4829,5001800US4836,0.054501533321307284 +5001800US4829,5001800US4818,0.01707176152866775 +5001800US4829,5001800US4802,0.002188218131622497 +5001800US4830,5001800US4830,0.81055350194274 +5001800US4830,5001800US4832,0.16845001703655557 
+5001800US4830,5001800US4833,0.0170451454406455 +5001800US4830,5001800US4805,0.0039513355800589265 +5001800US4831,5001800US4831,0.5523763873013532 +5001800US4831,5001800US4811,0.18186091958261125 +5001800US4831,5001800US4837,0.139949977262392 +5001800US4831,5001800US4817,0.07882268352936839 +5001800US4831,5001800US4810,0.04699003232427516 +5001800US4832,5001800US4832,0.41153177123086754 +5001800US4832,5001800US4805,0.3357794521236251 +5001800US4832,5001800US4824,0.21257067385109382 +5001800US4832,5001800US4830,0.03093020306694007 +5001800US4832,5001800US4803,0.009187899727473496 +5001800US4833,5001800US4833,0.7618571219504003 +5001800US4833,5001800US4806,0.1902009754880439 +5001800US4833,5001800US4825,0.021347497822785538 +5001800US4833,5001800US4812,0.012749000160742005 +5001800US4833,5001800US4830,0.009347027592444646 +5001800US4833,5001800US4824,0.004498376985583602 +5001800US4834,5001800US4834,0.7260866211307312 +5001800US4834,5001800US4827,0.21776602757401928 +5001800US4834,5001800US4815,0.05614735129524958 +5001800US4835,5001800US4835,0.4980094714394129 +5001800US4835,5001800US4828,0.27394251845319606 +5001800US4835,5001800US4821,0.13928400327303098 +5001800US4835,5001800US4837,0.056425867120029505 +5001800US4835,5001800US4827,0.018898362212602864 +5001800US4835,5001800US4820,0.013439777501727668 +5001800US4836,5001800US4836,0.7846067785752631 +5001800US4836,5001800US4814,0.1150296162296036 +5001800US4836,5001800US4802,0.057854278225642736 +5001800US4836,5001800US4808,0.04162963698126486 +5001800US4836,5001800US4829,0.0008796899882256878 +5001800US4901,5001800US4901,0.9091577680566965 +5001800US4901,5001800US4903,0.08594433513534165 +5001800US4901,5001800US4902,0.004897896807961948 +5001800US4902,5001800US4902,0.850299662104018 +5001800US4902,5001800US4901,0.1264724400185614 +5001800US4902,5001800US4904,0.023223540740321163 +5001800US4902,5001800US4903,4.357137099497404e-06 +5001800US4903,5001800US4903,0.8580554067686578 +5001800US4903,5001800US4904,0.1419445932313423 +5001800US4904,5001800US4904,0.6890885107218001 +5001800US4904,5001800US4902,0.16057517251905068 +5001800US4904,5001800US4903,0.11050594686522255 +5001800US4904,5001800US4901,0.03983036989392667 +5001800US5000,5001800US5000,1.0 +5001800US5101,5001800US5107,0.4138336030231669 +5001800US5101,5001800US5101,0.3760757059304695 +5001800US5101,5001800US5110,0.18039576075309813 +5001800US5101,5001800US5105,0.029694930293265413 +5001800US5102,5001800US5102,0.637406523986961 +5001800US5102,5001800US5103,0.19848895092520033 +5001800US5102,5001800US5101,0.16410452508783874 +5001800US5103,5001800US5103,0.8910319597934137 +5001800US5103,5001800US5102,0.10896804020658633 +5001800US5104,5001800US5104,0.826274517976534 +5001800US5104,5001800US5102,0.1736402268167411 +5001800US5104,5001800US5101,8.525520672493071e-05 +5001800US5104,5001800US5103,0.0 +5001800US5105,5001800US5105,0.7519580459694037 +5001800US5105,5001800US5109,0.1122738043233655 +5001800US5105,5001800US5110,0.09247051348238751 +5001800US5105,5001800US5104,0.022080139787935348 +5001800US5105,5001800US5107,0.021217496436907898 +5001800US5106,5001800US5106,0.7818803439226134 +5001800US5106,5001800US5105,0.16303469094355702 +5001800US5106,5001800US5109,0.055084965133829544 +5001800US5107,5001800US5101,0.36917190007713147 +5001800US5107,5001800US5107,0.2482688386662914 +5001800US5107,5001800US5104,0.24557675656745567 +5001800US5107,5001800US5105,0.13698250468912146 +5001800US5108,5001800US5108,0.8917904053884835 +5001800US5108,5001800US5111,0.10820959461151643 
+5001800US5109,5001800US5109,0.8736352440574401 +5001800US5109,5001800US5106,0.12636475594255986 +5001800US5110,5001800US5110,0.6431140976040182 +5001800US5110,5001800US5111,0.2229155152185085 +5001800US5110,5001800US5106,0.11234417068259463 +5001800US5110,5001800US5108,0.021626216494878683 +5001800US5111,5001800US5111,0.7142771189258514 +5001800US5111,5001800US5107,0.19590702534952578 +5001800US5111,5001800US5108,0.08625647196668619 +5001800US5111,5001800US5110,0.003559383757936571 +5001800US5301,5001800US5301,0.6544521645742007 +5001800US5301,5001800US5308,0.2143462379510508 +5001800US5301,5001800US5302,0.13046452131906006 +5001800US5301,5001800US5307,0.0007370761556884516 +5001800US5302,5001800US5302,0.7274945610630916 +5001800US5302,5001800US5301,0.2674738973027913 +5001800US5302,5001800US5308,0.005031541634117044 +5001800US5303,5001800US5303,0.9919913928719961 +5001800US5303,5001800US5304,0.007950744082599498 +5001800US5303,5001800US5310,5.786304540440844e-05 +5001800US5304,5001800US5304,0.9752913669888389 +5001800US5304,5001800US5305,0.024708633011161118 +5001800US5305,5001800US5305,1.0 +5001800US5306,5001800US5306,0.9678031875376516 +5001800US5306,5001800US5310,0.03219681246234835 +5001800US5307,5001800US5307,0.93133559866906 +5001800US5307,5001800US5302,0.06398800652663977 +5001800US5307,5001800US5309,0.00374244060117506 +5001800US5307,5001800US5301,0.0009339542031251035 +5001800US5308,5001800US5308,0.7489307823424434 +5001800US5308,5001800US5309,0.1608098906527457 +5001800US5308,5001800US5310,0.039252860518832026 +5001800US5308,5001800US5304,0.03868542971201218 +5001800US5308,5001800US5306,0.01232103677396672 +5001800US5309,5001800US5309,0.856215398176304 +5001800US5309,5001800US5301,0.05388375598456842 +5001800US5309,5001800US5306,0.04078059500546384 +5001800US5309,5001800US5307,0.03738154126660535 +5001800US5309,5001800US5308,0.01173870956705836 +5001800US5310,5001800US5310,0.9179232212990777 +5001800US5310,5001800US5306,0.04179782992767838 +5001800US5310,5001800US5303,0.029684911599845782 +5001800US5310,5001800US5308,0.010594037173398184 +5001800US5401,5001800US5402,0.9959840131183196 +5001800US5401,5001800US5401,0.004015986881680378 +5001800US5402,5001800US5401,0.6415205133288331 +5001800US5402,5001800US5402,0.3584794866711669 +5001800US5403,5001800US5401,1.0 +5001800US5501,5001800US5501,0.8404202053791578 +5001800US5501,5001800US5505,0.15536829376588288 +5001800US5501,5001800US5502,0.004211500854959236 +5001800US5502,5001800US5502,0.9055727408456167 +5001800US5502,5001800US5501,0.08796755651533779 +5001800US5502,5001800US5503,0.0064597026390455226 +5001800US5503,5001800US5503,0.9898182054912683 +5001800US5503,5001800US5507,0.01018179450873167 +5001800US5504,5001800US5504,0.9340872822066614 +5001800US5504,5001800US5501,0.06485038713631118 +5001800US5504,5001800US5505,0.0010623306570274028 +5001800US5505,5001800US5505,0.8350322664299464 +5001800US5505,5001800US5504,0.14214019024061014 +5001800US5505,5001800US5501,0.02017607729314409 +5001800US5505,5001800US5506,0.002651466036299318 +5001800US5506,5001800US5506,0.9870854663832449 +5001800US5506,5001800US5505,0.010109862174732536 +5001800US5506,5001800US5504,0.0027695503327306997 +5001800US5506,5001800US5508,3.512110929187482e-05 +5001800US5507,5001800US5507,0.9930117910625982 +5001800US5507,5001800US5503,0.006988208937401796 +5001800US5508,5001800US5508,0.976997827441602 +5001800US5508,5001800US5506,0.023002172558398034 +5001800US5600,5001800US5600,1.0 diff --git a/policyengine_us_data/storage/upload_completed_datasets.py 
b/policyengine_us_data/storage/upload_completed_datasets.py index 650b9873..e302d65a 100644 --- a/policyengine_us_data/storage/upload_completed_datasets.py +++ b/policyengine_us_data/storage/upload_completed_datasets.py @@ -15,6 +15,7 @@ def upload_datasets(): Pooled_3_Year_CPS_2023.file_path, CPS_2023.file_path, STORAGE_FOLDER / "small_enhanced_cps_2024.h5", + STORAGE_FOLDER / "policy_data.db", ] # Filter to only existing files From a1de133849fe7369a9a3064cc3258a2ed9328c9b Mon Sep 17 00:00:00 2001 From: Ben Ogorek Date: Fri, 15 Aug 2025 09:23:43 -0400 Subject: [PATCH 17/27] Store policy database in storage folder --- policyengine_us_data/db/create_database_tables.py | 6 +++++- policyengine_us_data/db/create_initial_strata.py | 4 +++- policyengine_us_data/db/etl_age.py | 4 +++- policyengine_us_data/db/etl_irs_soi.py | 4 +++- policyengine_us_data/db/etl_medicaid.py | 4 +++- policyengine_us_data/db/etl_snap.py | 6 ++++-- 6 files changed, 21 insertions(+), 7 deletions(-) diff --git a/policyengine_us_data/db/create_database_tables.py b/policyengine_us_data/db/create_database_tables.py index ec42fb61..cf0213ef 100644 --- a/policyengine_us_data/db/create_database_tables.py +++ b/policyengine_us_data/db/create_database_tables.py @@ -12,6 +12,8 @@ create_engine, ) +from policyengine_us_data.storage import STORAGE_FOLDER + logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", @@ -176,7 +178,9 @@ def calculate_definition_hash(mapper, connection, target: Stratum): ) -def create_database(db_uri="sqlite:///policy_data.db"): +def create_database( + db_uri: str = f"sqlite:///{STORAGE_FOLDER / 'policy_data.db'}", +): """ Creates a SQLite database and all the defined tables. diff --git a/policyengine_us_data/db/create_initial_strata.py b/policyengine_us_data/db/create_initial_strata.py index 068bca30..5653948b 100644 --- a/policyengine_us_data/db/create_initial_strata.py +++ b/policyengine_us_data/db/create_initial_strata.py @@ -3,6 +3,8 @@ import pandas as pd from sqlmodel import Session, create_engine +from policyengine_us_data.storage import STORAGE_FOLDER + from policyengine_us.variables.household.demographic.geographic.ucgid.ucgid_enum import ( UCGID, @@ -32,7 +34,7 @@ def main(): .reset_index(drop=True) ) - DATABASE_URL = "sqlite:///policy_data.db" + DATABASE_URL = f"sqlite:///{STORAGE_FOLDER / 'policy_data.db'}" engine = create_engine(DATABASE_URL) # map the ucgid_str 'code' to auto-generated 'stratum_id' diff --git a/policyengine_us_data/db/etl_age.py b/policyengine_us_data/db/etl_age.py index bc540373..f8b3e0a6 100644 --- a/policyengine_us_data/db/etl_age.py +++ b/policyengine_us_data/db/etl_age.py @@ -2,6 +2,8 @@ import numpy as np from sqlmodel import Session, create_engine +from policyengine_us_data.storage import STORAGE_FOLDER + from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -103,7 +105,7 @@ def load_age_data(df_long, geo, year, stratum_lookup=None): raise ValueError('geo must be one of "National", "State", "District"') # Prepare to load data ----------- - DATABASE_URL = "sqlite:///policy_data.db" + DATABASE_URL = f"sqlite:///{STORAGE_FOLDER / 'policy_data.db'}" engine = create_engine(DATABASE_URL) if stratum_lookup is None: diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py index 5e28e464..ecbec177 100644 --- a/policyengine_us_data/db/etl_irs_soi.py +++ b/policyengine_us_data/db/etl_irs_soi.py @@ -5,6 +5,8 @@ from sqlmodel import Session, create_engine 
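The recurring change in PATCH 17 is that every db script derives its SQLite URI from the package's STORAGE_FOLDER constant rather than hard-coding the relative filename "policy_data.db", presumably so all of them resolve the same file regardless of working directory. A minimal sketch of the pattern, with a standalone Path standing in for the real constant:

    from pathlib import Path

    from sqlalchemy import create_engine

    # Stand-in for policyengine_us_data.storage.STORAGE_FOLDER
    STORAGE_FOLDER = Path("policyengine_us_data/storage")

    # Same URI construction as in these hunks: one canonical sqlite file
    DATABASE_URL = f"sqlite:///{STORAGE_FOLDER / 'policy_data.db'}"
    engine = create_engine(DATABASE_URL)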
+from policyengine_us_data.storage import STORAGE_FOLDER + from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -283,7 +285,7 @@ def transform_soi_data(raw_df): def load_soi_data(long_dfs, year): - DATABASE_URL = "sqlite:///policy_data.db" + DATABASE_URL = f"sqlite:///{STORAGE_FOLDER / 'policy_data.db'}" engine = create_engine(DATABASE_URL) session = Session(engine) diff --git a/policyengine_us_data/db/etl_medicaid.py b/policyengine_us_data/db/etl_medicaid.py index 4ff96278..926a0d88 100644 --- a/policyengine_us_data/db/etl_medicaid.py +++ b/policyengine_us_data/db/etl_medicaid.py @@ -3,6 +3,8 @@ import pandas as pd from sqlmodel import Session, create_engine +from policyengine_us_data.storage import STORAGE_FOLDER + from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -83,7 +85,7 @@ def transform_medicaid_data(state_admin_df, cd_survey_df, year): def load_medicaid_data(long_state, long_cd, year): - DATABASE_URL = "sqlite:///policy_data.db" + DATABASE_URL = f"sqlite:///{STORAGE_FOLDER / 'policy_data.db'}" engine = create_engine(DATABASE_URL) stratum_lookup = {} diff --git a/policyengine_us_data/db/etl_snap.py b/policyengine_us_data/db/etl_snap.py index a60c0074..1fba44a4 100644 --- a/policyengine_us_data/db/etl_snap.py +++ b/policyengine_us_data/db/etl_snap.py @@ -7,6 +7,8 @@ import us from sqlmodel import Session, create_engine +from policyengine_us_data.storage import STORAGE_FOLDER + from policyengine_us_data.db.create_database_tables import ( Stratum, StratumConstraint, @@ -144,7 +146,7 @@ def transform_survey_snap_data(raw_df): def load_administrative_snap_data(df_states, year): - DATABASE_URL = "sqlite:///policy_data.db" + DATABASE_URL = f"sqlite:///{STORAGE_FOLDER / 'policy_data.db'}" engine = create_engine(DATABASE_URL) stratum_lookup = {} @@ -232,7 +234,7 @@ def load_survey_snap_data(survey_df, year, stratum_lookup=None): if stratum_lookup is None: raise ValueError("stratum_lookup must be provided") - DATABASE_URL = "sqlite:///policy_data.db" + DATABASE_URL = f"sqlite:///{STORAGE_FOLDER / 'policy_data.db'}" engine = create_engine(DATABASE_URL) with Session(engine) as session: From b3767264e3e15413e3a5c018f42c08cdb0fa61d0 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Fri, 15 Aug 2025 09:35:21 -0400 Subject: [PATCH 18/27] adding make database to reusable test. 
Updating changelog_entry --- .github/workflows/reusable_test.yaml | 6 +++++- changelog_entry.yaml | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/reusable_test.yaml b/.github/workflows/reusable_test.yaml index f9c5b49a..dce1daf4 100644 --- a/.github/workflows/reusable_test.yaml +++ b/.github/workflows/reusable_test.yaml @@ -58,6 +58,10 @@ jobs: if: inputs.full_suite run: make download + - name: Create and load calibration targets database + if: inputs.full_suite + run: make database + - name: Build datasets if: inputs.full_suite run: make data @@ -90,4 +94,4 @@ jobs: with: branch: gh-pages folder: docs/_build/html - clean: true \ No newline at end of file + clean: true diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 5bd54961..b3a3fb5d 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,4 +1,4 @@ - bump: minor changes: added: - - load script for eitc targets + - add SQLite database for calibration targets From 9078ed9a578e805e36d08bdb376937d0c1f4409d Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Fri, 15 Aug 2025 09:39:16 -0400 Subject: [PATCH 19/27] removing TODOs --- policyengine_us_data/db/etl_age.py | 1 - policyengine_us_data/utils/census.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/policyengine_us_data/db/etl_age.py b/policyengine_us_data/db/etl_age.py index f8b3e0a6..bb83067c 100644 --- a/policyengine_us_data/db/etl_age.py +++ b/policyengine_us_data/db/etl_age.py @@ -134,7 +134,6 @@ def load_age_data(df_long, geo, year, stratum_lookup=None): ) # Create constraints and link them to the parent's relationship attribute. - # TODO: greater_than_or_equal_to to just greater than! new_stratum.constraints_rel = [ StratumConstraint( constraint_variable="ucgid_str", diff --git a/policyengine_us_data/utils/census.py b/policyengine_us_data/utils/census.py index fb577e60..2f424ccb 100644 --- a/policyengine_us_data/utils/census.py +++ b/policyengine_us_data/utils/census.py @@ -127,8 +127,8 @@ def get_census_docs(year): docs_url = ( f"https://api.census.gov/data/{year}/acs/acs1/subject/variables.json" ) - # TODO: Alternative: incorporate it! - "https://api.census.gov/data/2023/acs/acs1/variables.json" + # NOTE: The URL for detail tables, should we ever need it, is: + # "https://api.census.gov/data/2023/acs/acs1/variables.json" docs_response = requests.get(docs_url) docs_response.raise_for_status() From 9913e3c5f271c59e278d69e9a762c56d8d848b0e Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Fri, 15 Aug 2025 10:00:41 -0400 Subject: [PATCH 20/27] Removed troublesome logging.
Updated Makefile --- Makefile | 4 +--- policyengine_us_data/db/create_database_tables.py | 3 --- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 01999135..21a39686 100644 --- a/Makefile +++ b/Makefile @@ -67,9 +67,6 @@ database: python policyengine_us_data/db/etl_snap.py python policyengine_us_data/db/etl_irs_soi.py -clean-database: - rm *.db - data: python policyengine_us_data/utils/uprating.py python policyengine_us_data/datasets/acs/acs.py @@ -84,6 +81,7 @@ data: clean: rm -f policyengine_us_data/storage/*.h5 + rm -f policyengine_us_data/storage/*.db git clean -fX -- '*.csv' rm -rf policyengine_us_data/docs/_build diff --git a/policyengine_us_data/db/create_database_tables.py b/policyengine_us_data/db/create_database_tables.py index cf0213ef..d6cfc8ec 100644 --- a/policyengine_us_data/db/create_database_tables.py +++ b/policyengine_us_data/db/create_database_tables.py @@ -173,9 +173,6 @@ def calculate_definition_hash(mapper, connection, target: Stratum): fingerprint_text = "\n".join(constraint_strings) h = hashlib.sha256(fingerprint_text.encode("utf-8")) target.definition_hash = h.hexdigest() - logger.info( - f"Set definition_hash for Stratum to '{target.definition_hash}'" - ) def create_database( From fddc3acafa14f0d95170b7752eb6fc624658a757 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Fri, 15 Aug 2025 16:14:56 -0400 Subject: [PATCH 21/27] updated comments based on feedback. Removed old make target --- Makefile | 6 ------ policyengine_us_data/db/etl_irs_soi.py | 18 +++++++++--------- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/Makefile b/Makefile index 21a39686..795a77b6 100644 --- a/Makefile +++ b/Makefile @@ -22,12 +22,6 @@ changelog: download: python policyengine_us_data/storage/download_private_prerequisites.py -targets: - python policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py - python policyengine_us_data/storage/calibration_targets/pull_age_targets.py - python policyengine_us_data/storage/calibration_targets/pull_soi_targets.py - python policyengine_us_data/storage/calibration_targets/pull_snap_targets.py - upload: python policyengine_us_data/storage/upload_completed_datasets.py diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py index ecbec177..74abab9e 100644 --- a/policyengine_us_data/db/etl_irs_soi.py +++ b/policyengine_us_data/db/etl_irs_soi.py @@ -27,8 +27,8 @@ """See the 22incddocguide.docx manual from the IRS SOI""" # Let's make this work with strict inequalities -# Interpret Language: '$10,000 under $25,000' -epsilon = 0.005 # Half a penny +# Language in the doc: '$10,000 under $25,000' +epsilon = 0.005 # i.e., half a penny AGI_STUB_TO_INCOME_RANGE = { 1: (-np.inf, 1), 2: (1 - epsilon, 10_000), @@ -191,7 +191,7 @@ def transform_soi_data(raw_df): # State ------------------- # You've got agi_stub == 0 in here, which you want to use any time you don't want to - # break things up by AGI + # divide data by AGI classes (i.e., agi_stub) state_df = raw_df.copy().loc[ (raw_df.STATE != "US") & (raw_df.CONG_DISTRICT == 0) ] @@ -200,7 +200,6 @@ def transform_soi_data(raw_df): ).str.zfill(2) # District ------------------ - # This is going to fail because we're missing the single cong district states district_df = raw_df.copy().loc[(raw_df.CONG_DISTRICT > 0)] max_cong_district_by_state = raw_df.groupby("STATE")[ @@ -284,6 +283,7 @@ def transform_soi_data(raw_df): def load_soi_data(long_dfs, year): + """Load a list of databases into the db, 
critically dependent on order""" DATABASE_URL = f"sqlite:///{STORAGE_FOLDER / 'policy_data.db'}" engine = create_engine(DATABASE_URL) @@ -291,7 +291,6 @@ def load_soi_data(long_dfs, year): session = Session(engine) # Load EITC data -------------------------------------------------------- - # Obviously this is not especially robust --- eitc_data = { "0": (long_dfs[0], long_dfs[1]), "1": (long_dfs[2], long_dfs[3]), @@ -377,7 +376,7 @@ def load_soi_data(long_dfs, year): session.commit() - # No breakdown variables in this set + # There are no breakdown variables used in the following set for j in range(8, 42, 2): count_j, amount_j = long_dfs[j], long_dfs[j + 1] amount_variable_name = amount_j.iloc[0][["target_variable"]].values[0] @@ -446,8 +445,7 @@ def load_soi_data(long_dfs, year): agi_stub = agi_df.iloc[0][["breakdown_value"]].values[0] agi_income_lower, agi_income_upper = AGI_STUB_TO_INCOME_RANGE[agi_stub] - # Make a National Stratum for each AGI Stub, even though there's no national target - # There no national target because the data set only has agi_stub = 0 for national + # Make a National Stratum for each AGI Stub even w/o associated national target note = f"Geo: 0100000US, AGI > {agi_income_lower}, AGI < {agi_income_upper}" nat_stratum = Stratum( parent_stratum_id=None, stratum_group_id=0, notes=note @@ -540,7 +538,9 @@ def load_soi_data(long_dfs, year): def main(): - year = 2022 # NOTE: predates the finalization of the 2020 Census redistricting + # NOTE: predates the finalization of the 2020 Census redistricting + # and there is district mapping in the Transform step + year = 2022 # Extract ----------------------- raw_df = extract_soi_data() From 0cf920a157aa30675a5c51ba64ae404380ac1e8c Mon Sep 17 00:00:00 2001 From: Ben Ogorek Date: Mon, 18 Aug 2025 12:52:48 -0400 Subject: [PATCH 22/27] test: move database tests into package --- policyengine_us_data/tests/test_database.py | 110 ++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 policyengine_us_data/tests/test_database.py diff --git a/policyengine_us_data/tests/test_database.py b/policyengine_us_data/tests/test_database.py new file mode 100644 index 00000000..13c102e5 --- /dev/null +++ b/policyengine_us_data/tests/test_database.py @@ -0,0 +1,110 @@ +import hashlib +from enum import Enum + +import pytest +from sqlalchemy.exc import IntegrityError +from sqlmodel import Session, select + +from policyengine_us_data.db.create_database_tables import ( + Stratum, + StratumConstraint, + Target, + create_database, +) +from policyengine_us_data.db import create_initial_strata + + +@pytest.fixture +def engine(tmp_path): + db_uri = f"sqlite:///{tmp_path/'test.db'}" + return create_database(db_uri) + + +def test_stratum_hash_and_relationships(engine): + with Session(engine) as session: + stratum = Stratum(notes="test", stratum_group_id=0) + stratum.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", operation="in", value="0001" + ), + StratumConstraint( + constraint_variable="age", operation="greater_than", value="20" + ), + StratumConstraint( + constraint_variable="age", operation="less_than", value="65" + ), + ] + stratum.targets_rel = [ + Target(variable="person_count", period=2023, value=100.0) + ] + session.add(stratum) + session.commit() + expected_hash = hashlib.sha256( + "\n".join( + sorted( + [ + "ucgid_str|in|0001", + "age|greater_than|20", + "age|less_than|65", + ] + ) + ).encode("utf-8") + ).hexdigest() + assert stratum.definition_hash == expected_hash + retrieved = session.get(Stratum, 
stratum.stratum_id) + assert len(retrieved.constraints_rel) == 3 + assert retrieved.targets_rel[0].value == 100.0 + + +def test_unique_definition_hash(engine): + with Session(engine) as session: + s1 = Stratum(stratum_group_id=0) + s1.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", operation="in", value="0001" + ) + ] + session.add(s1) + session.commit() + s2 = Stratum(stratum_group_id=0) + s2.constraints_rel = [ + StratumConstraint( + constraint_variable="ucgid_str", operation="in", value="0001" + ) + ] + session.add(s2) + with pytest.raises(IntegrityError): + session.commit() + + +def test_create_initial_strata(monkeypatch, engine, tmp_path): + # ``monkeypatch`` is a pytest fixture that lets us temporarily modify or replace + # objects during a test. Here we use it to point ``STORAGE_FOLDER`` to a + # temporary directory so the test doesn't touch real data on disk. + monkeypatch.setattr(create_initial_strata, "STORAGE_FOLDER", tmp_path) + + class FakeEnum(Enum): + NAT = "NAT" + STATE = "STATE" + DIST = "DIST" + + def get_hierarchical_codes(self): + mapping = { + FakeEnum.NAT: ["NAT"], + FakeEnum.STATE: ["STATE", "NAT"], + FakeEnum.DIST: ["DIST", "STATE", "NAT"], + } + return mapping[self] + + # Replace the real ``UCGID`` enumeration with our simplified version so the + # test can run without downloading geographic data. + monkeypatch.setattr(create_initial_strata, "UCGID", FakeEnum) + create_initial_strata.main() + with Session(engine) as session: + strata = session.exec(select(Stratum).order_by(Stratum.stratum_id)).all() + assert len(strata) == 3 + nat, state, dist = strata + assert state.parent_stratum_id == nat.stratum_id + assert dist.parent_stratum_id == state.stratum_id + codes = [s.constraints_rel[0].value for s in strata] + assert codes == ["NAT", "STATE", "DIST"] From 0571ff547e1fbf73a4f41fda0d5ad51636a7b509 Mon Sep 17 00:00:00 2001 From: Ben Ogorek Date: Mon, 18 Aug 2025 14:31:24 -0400 Subject: [PATCH 23/27] Add Great Expectations validation for database --- Makefile | 13 ++++---- .../checkpoints/policy_data_checkpoint.yml | 13 ++++++++ .../expectations/policy_data_suite.json | 12 +++++++ great_expectations/great_expectations.yml | 31 +++++++++++++++++++ policyengine_us_data/db/validate_database.py | 18 +++++++++++ pyproject.toml | 1 + 6 files changed, 82 insertions(+), 6 deletions(-) create mode 100644 great_expectations/checkpoints/policy_data_checkpoint.yml create mode 100644 great_expectations/expectations/policy_data_suite.json create mode 100644 great_expectations/great_expectations.yml create mode 100644 policyengine_us_data/db/validate_database.py diff --git a/Makefile b/Makefile index 795a77b6..79675070 100644 --- a/Makefile +++ b/Makefile @@ -54,12 +54,13 @@ documentation-dev: myst start database: - python policyengine_us_data/db/create_database_tables.py - python policyengine_us_data/db/create_initial_strata.py - python policyengine_us_data/db/etl_age.py - python policyengine_us_data/db/etl_medicaid.py - python policyengine_us_data/db/etl_snap.py - python policyengine_us_data/db/etl_irs_soi.py + python policyengine_us_data/db/create_database_tables.py + python policyengine_us_data/db/create_initial_strata.py + python policyengine_us_data/db/etl_age.py + python policyengine_us_data/db/etl_medicaid.py + python policyengine_us_data/db/etl_snap.py + python policyengine_us_data/db/etl_irs_soi.py + python policyengine_us_data/db/validate_database.py data: python policyengine_us_data/utils/uprating.py diff --git 
a/great_expectations/checkpoints/policy_data_checkpoint.yml b/great_expectations/checkpoints/policy_data_checkpoint.yml new file mode 100644 index 00000000..45111b50 --- /dev/null +++ b/great_expectations/checkpoints/policy_data_checkpoint.yml @@ -0,0 +1,13 @@ +name: policy_data_checkpoint +config_version: 1.0 +class_name: SimpleCheckpoint +validations: + - batch_request: + datasource_name: policy_db + data_connector_name: default_runtime_data_connector_name + data_asset_name: strata + runtime_parameters: + query: SELECT * FROM strata + batch_identifiers: + default_identifier_name: default + expectation_suite_name: policy_data_suite diff --git a/great_expectations/expectations/policy_data_suite.json b/great_expectations/expectations/policy_data_suite.json new file mode 100644 index 00000000..7da34546 --- /dev/null +++ b/great_expectations/expectations/policy_data_suite.json @@ -0,0 +1,12 @@ +{ + "expectation_suite_name": "policy_data_suite", + "expectations": [ + { + "expectation_type": "expect_table_row_count_to_be_greater_than", + "kwargs": {"value": 0} + } + ], + "meta": { + "great_expectations_version": "0.18" + } +} diff --git a/great_expectations/great_expectations.yml b/great_expectations/great_expectations.yml new file mode 100644 index 00000000..f4ba7441 --- /dev/null +++ b/great_expectations/great_expectations.yml @@ -0,0 +1,31 @@ +config_version: 3.0 +datasources: + policy_db: + class_name: Datasource + execution_engine: + class_name: SqlAlchemyExecutionEngine + connection_string: sqlite:///policyengine_us_data/storage/policy_data.db + data_connectors: + default_runtime_data_connector_name: + class_name: RuntimeDataConnector + batch_identifiers: + - default_identifier_name +stores: + expectations_store: + class_name: ExpectationsStore + store_backend: + class_name: InlineStoreBackend + validations_store: + class_name: ValidationsStore + store_backend: + class_name: InlineStoreBackend + checkpoint_store: + class_name: CheckpointStore + store_backend: + class_name: InlineStoreBackend +expectations_store_name: expectations_store +validations_store_name: validations_store +checkpoint_store_name: checkpoint_store +data_docs_sites: {} +anonymous_usage_statistics: + enabled: false diff --git a/policyengine_us_data/db/validate_database.py b/policyengine_us_data/db/validate_database.py new file mode 100644 index 00000000..4ea1203d --- /dev/null +++ b/policyengine_us_data/db/validate_database.py @@ -0,0 +1,18 @@ +from __future__ import annotations + +import great_expectations as ge + + +def main() -> None: + """Run Great Expectations validation on the policy data database.""" + # Ensure we load the DataContext from the repository root + context = ge.get_context() + # Execute the checkpoint configured for the policy database + result = context.run_checkpoint(checkpoint_name="policy_data_checkpoint") + if not result["success"]: + raise ValueError("Great Expectations validation failed") + print("Great Expectations validation succeeded") + + +if __name__ == "__main__": # pragma: no cover - script entry point + main() diff --git a/pyproject.toml b/pyproject.toml index 3e87a403..16ca41f0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,7 @@ dependencies = [ "sqlalchemy>=2.0.41", "sqlmodel>=0.0.24", "xlrd>=2.0.2", + "great_expectations>=0.18.0", ] [project.optional-dependencies] From bdef5011c93ae91c44b21a8414fb7a8e3769e955 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Mon, 18 Aug 2025 18:49:24 -0400 Subject: [PATCH 24/27] working pre lint --- Makefile | 14 ++-- 
.../checkpoints/policy_data_checkpoint.yml | 13 ---- .../expectations/policy_data_suite.json | 12 --- great_expectations/great_expectations.yml | 31 -------- .../db/create_database_tables.py | 12 ++- policyengine_us_data/db/etl_irs_soi.py | 76 +++++++------------ policyengine_us_data/db/validate_database.py | 31 ++++---- policyengine_us_data/tests/test_database.py | 42 +--------- pyproject.toml | 1 - 9 files changed, 65 insertions(+), 167 deletions(-) delete mode 100644 great_expectations/checkpoints/policy_data_checkpoint.yml delete mode 100644 great_expectations/expectations/policy_data_suite.json delete mode 100644 great_expectations/great_expectations.yml diff --git a/Makefile b/Makefile index 79675070..b03e23d5 100644 --- a/Makefile +++ b/Makefile @@ -54,13 +54,13 @@ documentation-dev: myst start database: - python policyengine_us_data/db/create_database_tables.py - python policyengine_us_data/db/create_initial_strata.py - python policyengine_us_data/db/etl_age.py - python policyengine_us_data/db/etl_medicaid.py - python policyengine_us_data/db/etl_snap.py - python policyengine_us_data/db/etl_irs_soi.py - python policyengine_us_data/db/validate_database.py + python policyengine_us_data/db/create_database_tables.py + python policyengine_us_data/db/create_initial_strata.py + python policyengine_us_data/db/etl_age.py + python policyengine_us_data/db/etl_medicaid.py + python policyengine_us_data/db/etl_snap.py + python policyengine_us_data/db/etl_irs_soi.py + python policyengine_us_data/db/validate_database.py data: python policyengine_us_data/utils/uprating.py diff --git a/great_expectations/checkpoints/policy_data_checkpoint.yml b/great_expectations/checkpoints/policy_data_checkpoint.yml deleted file mode 100644 index 45111b50..00000000 --- a/great_expectations/checkpoints/policy_data_checkpoint.yml +++ /dev/null @@ -1,13 +0,0 @@ -name: policy_data_checkpoint -config_version: 1.0 -class_name: SimpleCheckpoint -validations: - - batch_request: - datasource_name: policy_db - data_connector_name: default_runtime_data_connector_name - data_asset_name: strata - runtime_parameters: - query: SELECT * FROM strata - batch_identifiers: - default_identifier_name: default - expectation_suite_name: policy_data_suite diff --git a/great_expectations/expectations/policy_data_suite.json b/great_expectations/expectations/policy_data_suite.json deleted file mode 100644 index 7da34546..00000000 --- a/great_expectations/expectations/policy_data_suite.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "expectation_suite_name": "policy_data_suite", - "expectations": [ - { - "expectation_type": "expect_table_row_count_to_be_greater_than", - "kwargs": {"value": 0} - } - ], - "meta": { - "great_expectations_version": "0.18" - } -} diff --git a/great_expectations/great_expectations.yml b/great_expectations/great_expectations.yml deleted file mode 100644 index f4ba7441..00000000 --- a/great_expectations/great_expectations.yml +++ /dev/null @@ -1,31 +0,0 @@ -config_version: 3.0 -datasources: - policy_db: - class_name: Datasource - execution_engine: - class_name: SqlAlchemyExecutionEngine - connection_string: sqlite:///policyengine_us_data/storage/policy_data.db - data_connectors: - default_runtime_data_connector_name: - class_name: RuntimeDataConnector - batch_identifiers: - - default_identifier_name -stores: - expectations_store: - class_name: ExpectationsStore - store_backend: - class_name: InlineStoreBackend - validations_store: - class_name: ValidationsStore - store_backend: - class_name: InlineStoreBackend - 
checkpoint_store: - class_name: CheckpointStore - store_backend: - class_name: InlineStoreBackend -expectations_store_name: expectations_store -validations_store_name: validations_store -checkpoint_store_name: checkpoint_store -data_docs_sites: {} -anonymous_usage_statistics: - enabled: false diff --git a/policyengine_us_data/db/create_database_tables.py b/policyengine_us_data/db/create_database_tables.py index d6cfc8ec..4c451467 100644 --- a/policyengine_us_data/db/create_database_tables.py +++ b/policyengine_us_data/db/create_database_tables.py @@ -1,19 +1,21 @@ import logging import hashlib from typing import List, Optional +from enum import Enum from sqlalchemy import event, UniqueConstraint from sqlalchemy.orm.attributes import get_history - from sqlmodel import ( Field, Relationship, SQLModel, create_engine, ) +from policyengine_us.system import system from policyengine_us_data.storage import STORAGE_FOLDER + logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", @@ -22,6 +24,10 @@ logger = logging.getLogger(__name__) +# An Enum type to ensure the variable exists in policyengine-us +USVariable = Enum("USVariable", {name: name for name in system.variables.keys()}, type=str) + + class Stratum(SQLModel, table=True): """Represents a unique population subgroup (stratum).""" @@ -81,7 +87,7 @@ class StratumConstraint(SQLModel, table=True): __tablename__ = "stratum_constraints" stratum_id: int = Field(foreign_key="strata.stratum_id", primary_key=True) - constraint_variable: str = Field( + constraint_variable: USVariable = Field( primary_key=True, description="The variable the constraint applies to (e.g., 'age').", ) @@ -114,7 +120,7 @@ class Target(SQLModel, table=True): ) target_id: Optional[int] = Field(default=None, primary_key=True) - variable: str = Field( + variable: USVariable = Field( description="A variable defined in policyengine-us (e.g., 'income_tax')." 
) period: int = Field( diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py index 74abab9e..bda932fd 100644 --- a/policyengine_us_data/db/etl_irs_soi.py +++ b/policyengine_us_data/db/etl_irs_soi.py @@ -80,9 +80,9 @@ def make_records( def make_agi_long(df: pd.DataFrame) -> pd.DataFrame: """Convert IRS SOI AGI‑split table from wide to the long format used""" target_col_map = { - "N1": "agi_tax_unit_count", - "N2": "agi_person_count", - "A00100": "agi_total_amount", + "N1": "tax_unit_count", + "N2": "person_count", + "A00100": "adjusted_gross_income", } work = df[["ucgid_str", "agi_stub"] + list(target_col_map)].rename( columns=target_col_map @@ -156,32 +156,27 @@ def extract_soi_data() -> pd.DataFrame: def transform_soi_data(raw_df): TARGETS = [ - dict(code="59661", name="eitc", breakdown=("eitc_children", 0)), - dict(code="59662", name="eitc", breakdown=("eitc_children", 1)), - dict(code="59663", name="eitc", breakdown=("eitc_children", 2)), - dict(code="59664", name="eitc", breakdown=("eitc_children", "3+")), - dict(code="59664", name="qbid", breakdown=None), + dict(code="59661", name="eitc", breakdown=("eitc_child_count", 0)), + dict(code="59662", name="eitc", breakdown=("eitc_child_count", 1)), + dict(code="59663", name="eitc", breakdown=("eitc_child_count", 2)), + dict(code="59664", name="eitc", breakdown=("eitc_child_count", "3+")), + dict(code="59664", name="qualified_business_income_deduction", breakdown=None), dict(code="18500", name="real_estate_taxes", breakdown=None), dict(code="01000", name="net_capital_gain", breakdown=None), - dict(code="03150", name="ira_payments", breakdown=None), - dict(code="00300", name="taxable_interest", breakdown=None), - dict(code="00400", name="tax_exempt_interest", breakdown=None), - dict(code="00600", name="oridinary_dividends", breakdown=None), - dict(code="00650", name="qualified_dividends", breakdown=None), + dict(code="03150", name="retirement_distributions", breakdown=None), + dict(code="00300", name="taxable_interest_income", breakdown=None), + dict(code="00400", name="tax_exempt_interest_income", breakdown=None), + dict(code="00600", name="non_qualified_dividend_income", breakdown=None), + dict(code="00650", name="qualified_dividend_income", breakdown=None), dict( code="26270", - name="partnership_and_s_crop_net_income", + name="partnership_s_corp_income", breakdown=None, ), - dict(code="02500", name="total_social_security", breakdown=None), - dict(code="01700", name="pension_and_annuities", breakdown=None), + dict(code="02500", name="social_security", breakdown=None), dict(code="02300", name="unemployment_compensation", breakdown=None), - dict(code="00900", name="business_net_income", breakdown=None), - dict( - code="17000", name="medical_and_dental_deduction", breakdown=None - ), - dict(code="00700", name="salt_refunds", breakdown=None), - dict(code="18425", name="salt_amount", breakdown=None), + dict(code="00700", name="salt_refund_income", breakdown=None), + dict(code="18425", name="reported_salt", breakdown=None), dict(code="06500", name="income_tax", breakdown=None), ] @@ -257,7 +252,7 @@ def transform_soi_data(raw_df): temp_df = df[["ucgid_str"]].copy() temp_df["breakdown_variable"] = "one" temp_df["breakdown_value"] = 1 - temp_df["target_variable"] = "agi" + temp_df["target_variable"] = "adjusted_gross_income" temp_df["target_value"] = df["A00100"] * 1_000 records.append(temp_df) @@ -349,14 +344,6 @@ def load_soi_data(long_dfs, year): ) new_stratum.targets_rel = [ - # It's already complex 
enough - # Target( - # variable="tax_unit_count", - # period=year, - # value=eitc_count_i.iloc[i][["target_value"]].values[0], - # source_id=5, - # active=True, - # ), Target( variable="eitc", period=year, @@ -377,7 +364,7 @@ def load_soi_data(long_dfs, year): session.commit() # There are no breakdown variables used in the following set - for j in range(8, 42, 2): + for j in range(8, 36, 2): count_j, amount_j = long_dfs[j], long_dfs[j + 1] amount_variable_name = amount_j.iloc[0][["target_variable"]].values[0] print( @@ -391,16 +378,6 @@ def load_soi_data(long_dfs, year): amount_value = amount_j.iloc[i][["target_value"]].values[0] stratum.targets_rel.append( - # NOTE: If I do the counts, I'm going to need to explode the strata for the vars != 0 - # OR, create new variables like qbid_tax_unit_count which requires adding stuff to -us - # AND, it's already complex enough ----- - # Target( - # variable="tax_unit_count", - # period=year, - # value=count_j.iloc[i][["target_value"]].values[0], - # source_id=5, - # active=True, - # ), Target( variable=amount_variable_name, period=year, @@ -416,14 +393,15 @@ def load_soi_data(long_dfs, year): session.commit() # Adjusted Gross Income ------ - agi_values = long_dfs[42] + agi_values = long_dfs[36] + assert agi_values[['target_variable']].values[0] == 'adjusted_gross_income' for i in range(agi_values.shape[0]): ucgid_i = agi_values[["ucgid_str"]].iloc[i].values[0] stratum = get_simple_stratum_by_ucgid(session, ucgid_i) stratum.targets_rel.append( Target( - variable="agi", + variable="adjusted_gross_income", period=year, value=agi_values.iloc[i][["target_value"]].values[0], source_id=5, @@ -438,7 +416,7 @@ def load_soi_data(long_dfs, year): agi_person_count_dfs = [ df for df in long_dfs[43:] - if df["target_variable"].iloc[0] == "agi_person_count" + if df["target_variable"].iloc[0] == "person_count" ] for agi_df in agi_person_count_dfs: @@ -458,12 +436,12 @@ def load_soi_data(long_dfs, year): value="0100000US", ), StratumConstraint( - constraint_variable="agi", + constraint_variable="adjusted_gross_income", operation="greater_than", value=str(agi_income_lower), ), StratumConstraint( - constraint_variable="agi", + constraint_variable="adjusted_gross_income", operation="less_than", value=str(agi_income_upper), ), @@ -505,12 +483,12 @@ def load_soi_data(long_dfs, year): value=ucgid_i, ), StratumConstraint( - constraint_variable="agi", + constraint_variable="adjusted_gross_income", operation="greater_than", value=str(agi_income_lower), ), StratumConstraint( - constraint_variable="agi", + constraint_variable="adjusted_gross_income", operation="less_than", value=str(agi_income_upper), ), diff --git a/policyengine_us_data/db/validate_database.py b/policyengine_us_data/db/validate_database.py index 4ea1203d..94dcdddf 100644 --- a/policyengine_us_data/db/validate_database.py +++ b/policyengine_us_data/db/validate_database.py @@ -1,18 +1,23 @@ -from __future__ import annotations +""" +This is the start of a data validation pipeline. It is meant to be a separate +validation track from the unit tests in policyengine_us_data/tests in that it tests +the overall correctness of data after a full pipeline run with production data. 
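+
+The check mirrors the USVariable enum in create_database_tables.py: both
+accept only names present in the policyengine-us variable registry, e.g.
+
+    from policyengine_us.system import system
+    assert "eitc" in system.variables.keys()
+
+Run this script as the last step of `make database`:
+
+    python policyengine_us_data/db/validate_database.py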
+""" +import sqlite3 -import great_expectations as ge +import pandas as pd +from policyengine_us.system import system -def main() -> None: - """Run Great Expectations validation on the policy data database.""" - # Ensure we load the DataContext from the repository root - context = ge.get_context() - # Execute the checkpoint configured for the policy database - result = context.run_checkpoint(checkpoint_name="policy_data_checkpoint") - if not result["success"]: - raise ValueError("Great Expectations validation failed") - print("Great Expectations validation succeeded") +conn = sqlite3.connect("policyengine_us_data/storage/policy_data.db") +stratum_constraints_df = pd.read_sql("SELECT * FROM stratum_constraints", conn) +targets_df = pd.read_sql("SELECT * FROM targets", conn) -if __name__ == "__main__": # pragma: no cover - script entry point - main() +for var_name in set(targets_df['variable']): + if not var_name in system.variables.keys(): + raise ValueError(f'{var_name} not a policyengine-us variable') + +for var_name in set(stratum_constraints_df['constraint_variable']): + if not var_name in system.variables.keys(): + raise ValueError(f'{var_name} not a policyengine-us variable') diff --git a/policyengine_us_data/tests/test_database.py b/policyengine_us_data/tests/test_database.py index 13c102e5..ce5eb211 100644 --- a/policyengine_us_data/tests/test_database.py +++ b/policyengine_us_data/tests/test_database.py @@ -11,7 +11,6 @@ Target, create_database, ) -from policyengine_us_data.db import create_initial_strata @pytest.fixture @@ -25,7 +24,7 @@ def test_stratum_hash_and_relationships(engine): stratum = Stratum(notes="test", stratum_group_id=0) stratum.constraints_rel = [ StratumConstraint( - constraint_variable="ucgid_str", operation="in", value="0001" + constraint_variable="ucgid_str", operation="equals", value="0400000US30" ), StratumConstraint( constraint_variable="age", operation="greater_than", value="20" @@ -43,7 +42,7 @@ def test_stratum_hash_and_relationships(engine): "\n".join( sorted( [ - "ucgid_str|in|0001", + "ucgid_str|equals|0400000US30", "age|greater_than|20", "age|less_than|65", ] @@ -61,7 +60,7 @@ def test_unique_definition_hash(engine): s1 = Stratum(stratum_group_id=0) s1.constraints_rel = [ StratumConstraint( - constraint_variable="ucgid_str", operation="in", value="0001" + constraint_variable="ucgid_str", operation="equals", value="0400000US30" ) ] session.add(s1) @@ -69,42 +68,9 @@ def test_unique_definition_hash(engine): s2 = Stratum(stratum_group_id=0) s2.constraints_rel = [ StratumConstraint( - constraint_variable="ucgid_str", operation="in", value="0001" + constraint_variable="ucgid_str", operation="equals", value="0400000US30" ) ] session.add(s2) with pytest.raises(IntegrityError): session.commit() - - -def test_create_initial_strata(monkeypatch, engine, tmp_path): - # ``monkeypatch`` is a pytest fixture that lets us temporarily modify or replace - # objects during a test. Here we use it to point ``STORAGE_FOLDER`` to a - # temporary directory so the test doesn't touch real data on disk. - monkeypatch.setattr(create_initial_strata, "STORAGE_FOLDER", tmp_path) - - class FakeEnum(Enum): - NAT = "NAT" - STATE = "STATE" - DIST = "DIST" - - def get_hierarchical_codes(self): - mapping = { - FakeEnum.NAT: ["NAT"], - FakeEnum.STATE: ["STATE", "NAT"], - FakeEnum.DIST: ["DIST", "STATE", "NAT"], - } - return mapping[self] - - # Replace the real ``UCGID`` enumeration with our simplified version so the - # test can run without downloading geographic data. 
From d295926887f14bf802ab23e3742ed7a3aaa942e5 Mon Sep 17 00:00:00 2001
From: "baogorek@gmail.com"
Date: Mon, 18 Aug 2025 18:53:42 -0400
Subject: [PATCH 25/27] post lint

---
 .../db/create_database_tables.py             |  6 ++++--
 policyengine_us_data/db/etl_irs_soi.py       | 16 +++++++++++-----
 policyengine_us_data/db/validate_database.py | 11 ++++++-----
 policyengine_us_data/tests/test_database.py  | 12 +++++++++---
 4 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/policyengine_us_data/db/create_database_tables.py b/policyengine_us_data/db/create_database_tables.py
index 4c451467..df03772d 100644
--- a/policyengine_us_data/db/create_database_tables.py
+++ b/policyengine_us_data/db/create_database_tables.py
@@ -11,7 +11,7 @@
     SQLModel,
     create_engine,
 )
-from policyengine_us.system import system
+from policyengine_us.system import system
 
 from policyengine_us_data.storage import STORAGE_FOLDER
 
@@ -25,7 +25,9 @@
 
 
 # An Enum type to ensure the variable exists in policyengine-us
-USVariable = Enum("USVariable", {name: name for name in system.variables.keys()}, type=str)
+USVariable = Enum(
+    "USVariable", {name: name for name in system.variables.keys()}, type=str
+)
 
 
 class Stratum(SQLModel, table=True):
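The functional `USVariable` Enum above doubles as a validity gate: constructing a member from an unknown name raises immediately. A sketch with a stand-in registry (the real code enumerates all of `system.variables`):

    from enum import Enum

    # Stand-in for system.variables.keys().
    known = ["eitc", "income_tax", "tax_unit_count"]
    USVariable = Enum("USVariable", {name: name for name in known}, type=str)

    print(USVariable("eitc"))  # OK: USVariable.eitc

    try:
        USVariable("not_a_variable")
    except ValueError as err:
        print(err)  # 'not_a_variable' is not a valid USVariable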
constraint_variable="eitc_children", + constraint_variable="eitc_child_count", operation="equals", value=f"{n_children}", ) @@ -394,7 +400,7 @@ def load_soi_data(long_dfs, year): # Adjusted Gross Income ------ agi_values = long_dfs[36] - assert agi_values[['target_variable']].values[0] == 'adjusted_gross_income' + assert agi_values[["target_variable"]].values[0] == "adjusted_gross_income" for i in range(agi_values.shape[0]): ucgid_i = agi_values[["ucgid_str"]].iloc[i].values[0] diff --git a/policyengine_us_data/db/validate_database.py b/policyengine_us_data/db/validate_database.py index 94dcdddf..fee6a49d 100644 --- a/policyengine_us_data/db/validate_database.py +++ b/policyengine_us_data/db/validate_database.py @@ -3,10 +3,11 @@ validation track from the unit tests in policyengine_us_data/tests in that it tests the overall correctness of data after a full pipeline run with production data. """ + import sqlite3 import pandas as pd -from policyengine_us.system import system +from policyengine_us.system import system conn = sqlite3.connect("policyengine_us_data/storage/policy_data.db") @@ -14,10 +15,10 @@ stratum_constraints_df = pd.read_sql("SELECT * FROM stratum_constraints", conn) targets_df = pd.read_sql("SELECT * FROM targets", conn) -for var_name in set(targets_df['variable']): +for var_name in set(targets_df["variable"]): if not var_name in system.variables.keys(): - raise ValueError(f'{var_name} not a policyengine-us variable') + raise ValueError(f"{var_name} not a policyengine-us variable") -for var_name in set(stratum_constraints_df['constraint_variable']): +for var_name in set(stratum_constraints_df["constraint_variable"]): if not var_name in system.variables.keys(): - raise ValueError(f'{var_name} not a policyengine-us variable') + raise ValueError(f"{var_name} not a policyengine-us variable") diff --git a/policyengine_us_data/tests/test_database.py b/policyengine_us_data/tests/test_database.py index ce5eb211..64060b48 100644 --- a/policyengine_us_data/tests/test_database.py +++ b/policyengine_us_data/tests/test_database.py @@ -24,7 +24,9 @@ def test_stratum_hash_and_relationships(engine): stratum = Stratum(notes="test", stratum_group_id=0) stratum.constraints_rel = [ StratumConstraint( - constraint_variable="ucgid_str", operation="equals", value="0400000US30" + constraint_variable="ucgid_str", + operation="equals", + value="0400000US30", ), StratumConstraint( constraint_variable="age", operation="greater_than", value="20" @@ -60,7 +62,9 @@ def test_unique_definition_hash(engine): s1 = Stratum(stratum_group_id=0) s1.constraints_rel = [ StratumConstraint( - constraint_variable="ucgid_str", operation="equals", value="0400000US30" + constraint_variable="ucgid_str", + operation="equals", + value="0400000US30", ) ] session.add(s1) @@ -68,7 +72,9 @@ def test_unique_definition_hash(engine): s2 = Stratum(stratum_group_id=0) s2.constraints_rel = [ StratumConstraint( - constraint_variable="ucgid_str", operation="equals", value="0400000US30" + constraint_variable="ucgid_str", + operation="equals", + value="0400000US30", ) ] session.add(s2) From bd104d002d05c86d680cfcc64d4448b0de324f07 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Tue, 19 Aug 2025 23:19:42 -0400 Subject: [PATCH 26/27] updating IRS target variables --- policyengine_us_data/db/etl_irs_soi.py | 34 +++++++++++++++++--------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py index e191acb2..91f40121 100644 --- 
From bd104d002d05c86d680cfcc64d4448b0de324f07 Mon Sep 17 00:00:00 2001
From: "baogorek@gmail.com"
Date: Tue, 19 Aug 2025 23:19:42 -0400
Subject: [PATCH 26/27] updating IRS target variables

---
 policyengine_us_data/db/etl_irs_soi.py | 34 +++++++++++++++++---------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py
index e191acb2..91f40121 100644
--- a/policyengine_us_data/db/etl_irs_soi.py
+++ b/policyengine_us_data/db/etl_irs_soi.py
@@ -161,28 +161,31 @@ def transform_soi_data(raw_df):
         dict(code="59663", name="eitc", breakdown=("eitc_child_count", 2)),
         dict(code="59664", name="eitc", breakdown=("eitc_child_count", "3+")),
         dict(
-            code="59664",
+            code="04475",
             name="qualified_business_income_deduction",
             breakdown=None,
         ),
         dict(code="18500", name="real_estate_taxes", breakdown=None),
         dict(code="01000", name="net_capital_gain", breakdown=None),
-        dict(code="03150", name="retirement_distributions", breakdown=None),
+        dict(code="01400", name="taxable_ira_distributions", breakdown=None),
         dict(code="00300", name="taxable_interest_income", breakdown=None),
         dict(code="00400", name="tax_exempt_interest_income", breakdown=None),
-        dict(
-            code="00600", name="non_qualified_dividend_income", breakdown=None
-        ),
+        dict(code="00600", name="dividend_income", breakdown=None),
         dict(code="00650", name="qualified_dividend_income", breakdown=None),
         dict(
             code="26270",
-            name="partnership_s_corp_income",
+            name="tax_unit_partnership_s_corp_income",
             breakdown=None,
         ),
-        dict(code="02500", name="social_security", breakdown=None),
+        dict(code="02500", name="taxable_social_security", breakdown=None),
         dict(code="02300", name="unemployment_compensation", breakdown=None),
-        dict(code="00700", name="salt_refund_income", breakdown=None),
-        dict(code="18425", name="reported_salt", breakdown=None),
+        dict(code="17000", name="medical_expense_deduction", breakdown=None),
+        dict(code="01700", name="taxable_pension_income", breakdown=None),
+        dict(code="11070", name="refundable_ctc", breakdown=None),
+        # NOTE: A18460 is the capped SALT deduction and matches the `salt` variable.
+        # Our SALT base currently excludes personal property taxes (not modeled yet),
+        # so amounts may be slightly below IRS totals.
+        dict(code="18460", name="salt", breakdown=None),
         dict(code="06500", name="income_tax", breakdown=None),
     ]
@@ -370,7 +373,14 @@ def load_soi_data(long_dfs, year):
     session.commit()
 
     # There are no breakdown variables used in the following set
-    for j in range(8, 36, 2):
+    first_agi_index = [
+        i
+        for i in range(len(long_dfs))
+        if long_dfs[i][["target_variable"]].values[0]
+        == "adjusted_gross_income"
+        and long_dfs[i][["breakdown_variable"]].values[0] == "one"
+    ][0]
+    for j in range(8, first_agi_index, 2):
         count_j, amount_j = long_dfs[j], long_dfs[j + 1]
         amount_variable_name = amount_j.iloc[0][["target_variable"]].values[0]
         print(
@@ -399,7 +409,7 @@ def load_soi_data(long_dfs, year):
     session.commit()
 
     # Adjusted Gross Income ------
-    agi_values = long_dfs[36]
+    agi_values = long_dfs[first_agi_index]
     assert agi_values[["target_variable"]].values[0] == "adjusted_gross_income"
 
     for i in range(agi_values.shape[0]):
@@ -421,7 +431,7 @@ def load_soi_data(long_dfs, year):
     agi_person_count_dfs = [
         df
-        for df in long_dfs[43:]
+        for df in long_dfs[(first_agi_index + 1) :]
         if df["target_variable"].iloc[0] == "person_count"
     ]
 
     for agi_df in agi_person_count_dfs:
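Patch 26 also replaces the hardcoded `long_dfs[36]` with a positional scan for the first national AGI frame, so the loader no longer breaks when targets are added or removed upstream. The same scan on two stand-in frames:

    import pandas as pd

    # Stand-ins for the long-format frames produced by transform_soi_data.
    long_dfs = [
        pd.DataFrame({"target_variable": ["eitc"], "breakdown_variable": ["one"]}),
        pd.DataFrame(
            {
                "target_variable": ["adjusted_gross_income"],
                "breakdown_variable": ["one"],
            }
        ),
    ]

    first_agi_index = [
        i
        for i in range(len(long_dfs))
        if long_dfs[i][["target_variable"]].values[0] == "adjusted_gross_income"
        and long_dfs[i][["breakdown_variable"]].values[0] == "one"
    ][0]
    assert first_agi_index == 1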
+ dict(code="18460", name="salt", breakdown=None), dict(code="06500", name="income_tax", breakdown=None), ] @@ -370,7 +373,14 @@ def load_soi_data(long_dfs, year): session.commit() # There are no breakdown variables used in the following set - for j in range(8, 36, 2): + first_agi_index = [ + i + for i in range(len(long_dfs)) + if long_dfs[i][["target_variable"]].values[0] + == "adjusted_gross_income" + and long_dfs[i][["breakdown_variable"]].values[0] == "one" + ][0] + for j in range(8, first_agi_index, 2): count_j, amount_j = long_dfs[j], long_dfs[j + 1] amount_variable_name = amount_j.iloc[0][["target_variable"]].values[0] print( @@ -399,7 +409,7 @@ def load_soi_data(long_dfs, year): session.commit() # Adjusted Gross Income ------ - agi_values = long_dfs[36] + agi_values = long_dfs[first_agi_index] assert agi_values[["target_variable"]].values[0] == "adjusted_gross_income" for i in range(agi_values.shape[0]): @@ -421,7 +431,7 @@ def load_soi_data(long_dfs, year): agi_person_count_dfs = [ df - for df in long_dfs[43:] + for df in long_dfs[(first_agi_index + 1) :] if df["target_variable"].iloc[0] == "person_count" ] From 28753119e3e7c26663cf96a0d06902679903bd42 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Wed, 20 Aug 2025 13:53:22 -0400 Subject: [PATCH 27/27] changing the salt variable to uncapped --- policyengine_us_data/db/etl_irs_soi.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py index 91f40121..786abb1c 100644 --- a/policyengine_us_data/db/etl_irs_soi.py +++ b/policyengine_us_data/db/etl_irs_soi.py @@ -182,10 +182,7 @@ def transform_soi_data(raw_df): dict(code="17000", name="medical_expense_deduction", breakdown=None), dict(code="01700", name="taxable_pension_income", breakdown=None), dict(code="11070", name="refundable_ctc", breakdown=None), - # NOTE: A18460 is the capped SALT deduction and matches the `salt` variable. - # Our SALT base currently excludes personal property taxes (not modeled yet), - # so amounts may be slightly below IRS totals. - dict(code="18460", name="salt", breakdown=None), + dict(code="18425", name="salt", breakdown=None), dict(code="06500", name="income_tax", breakdown=None), ]