Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
cd6cf58
first round of eitc targets are added
baogorek Jul 29, 2025
867bec6
linting
baogorek Jul 29, 2025
c2dd4af
changelog_entry.yaml
baogorek Jul 29, 2025
9f7f674
merging main
baogorek Jul 29, 2025
92979ca
Merge branch 'main' of github.com:PolicyEngine/policyengine-us-data i…
baogorek Aug 2, 2025
95a4a9a
new file in progress
baogorek Aug 2, 2025
6fd3542
moving to QBID and SALT
baogorek Aug 7, 2025
c73ef87
new variables added
baogorek Aug 7, 2025
57d9850
medicaid etl file
baogorek Aug 8, 2025
33cf8e9
merging main
baogorek Aug 11, 2025
9c4838e
medicaid is loading in
baogorek Aug 11, 2025
57716f2
medicaid and some SNAP data
baogorek Aug 12, 2025
7b3cacc
got SNAP settled
baogorek Aug 12, 2025
e45072e
progress
baogorek Aug 12, 2025
6d482e7
all major targets loaded
baogorek Aug 14, 2025
dddf689
linting
baogorek Aug 14, 2025
9c3a460
fixed national stratum in agi script
baogorek Aug 15, 2025
81e2011
refactor: use sqlmodel session
baogorek Aug 15, 2025
d5b3571
storage file updates
baogorek Aug 15, 2025
a729450
Merge branch 'treasury' into codex/fix-orm-inconsistencies-and-improv…
baogorek Aug 15, 2025
26b561b
Merge pull request #430 from PolicyEngine/codex/fix-orm-inconsistenci…
baogorek Aug 15, 2025
a1de133
Store policy database in storage folder
baogorek Aug 15, 2025
b7867db
Merge pull request #431 from PolicyEngine/codex/update-storage-path-f…
baogorek Aug 15, 2025
b376726
adding make database to reusable test. Updating changelog_entry
baogorek Aug 15, 2025
9078ed9
removing TODOs
baogorek Aug 15, 2025
9913e3c
Removed troublesome logging. Updated Makefile
baogorek Aug 15, 2025
fddc3ac
updated comments based on feedback. Removed old make target
baogorek Aug 15, 2025
0cf920a
test: move database tests into package
baogorek Aug 18, 2025
35f78cd
Merge pull request #433 from PolicyEngine/codex/add-tests-for-policye…
baogorek Aug 18, 2025
0571ff5
Add Great Expectations validation for database
baogorek Aug 18, 2025
648eabf
Merge pull request #434 from PolicyEngine/codex/add-great-expectation…
baogorek Aug 18, 2025
bdef501
working pre lint
baogorek Aug 18, 2025
d295926
post lint
baogorek Aug 18, 2025
bd104d0
updating IRS target variables
baogorek Aug 20, 2025
2875311
changing the salt variable to uncapped
baogorek Aug 20, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .github/workflows/reusable_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ jobs:
if: inputs.full_suite
run: make download

- name: Create and load calibration targets database
if: inputs.full_suite
run: make database

- name: Build datasets
if: inputs.full_suite
run: make data
Expand Down Expand Up @@ -90,4 +94,4 @@ jobs:
with:
branch: gh-pages
folder: docs/_build/html
clean: true
clean: true
17 changes: 7 additions & 10 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,6 @@ changelog:
download:
python policyengine_us_data/storage/download_private_prerequisites.py

targets:
python policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py
python policyengine_us_data/storage/calibration_targets/pull_age_targets.py
python policyengine_us_data/storage/calibration_targets/pull_soi_targets.py
python policyengine_us_data/storage/calibration_targets/pull_snap_targets.py

upload:
python policyengine_us_data/storage/upload_completed_datasets.py

Expand Down Expand Up @@ -61,10 +55,12 @@ documentation-dev:

database:
python policyengine_us_data/db/create_database_tables.py
python policyengine_us_data/db/load_age_targets.py

clean-database:
rm *.db
python policyengine_us_data/db/create_initial_strata.py
python policyengine_us_data/db/etl_age.py
python policyengine_us_data/db/etl_medicaid.py
python policyengine_us_data/db/etl_snap.py
python policyengine_us_data/db/etl_irs_soi.py
python policyengine_us_data/db/validate_database.py

data:
python policyengine_us_data/utils/uprating.py
Expand All @@ -80,6 +76,7 @@ data:

clean:
rm -f policyengine_us_data/storage/*.h5
rm -f policyengine_us_data/storage/*.db
git clean -fX -- '*.csv'
rm -rf policyengine_us_data/docs/_build

Expand Down
4 changes: 4 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
- bump: minor
changes:
added:
- add SQLite database for calibration targets
23 changes: 16 additions & 7 deletions policyengine_us_data/db/create_database_tables.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
import logging
import hashlib
from typing import List, Optional
from enum import Enum

from sqlalchemy import event, UniqueConstraint
from sqlalchemy.orm.attributes import get_history

from sqlmodel import (
Field,
Relationship,
SQLModel,
create_engine,
)
from policyengine_us.system import system

from policyengine_us_data.storage import STORAGE_FOLDER


logging.basicConfig(
level=logging.INFO,
Expand All @@ -20,6 +24,12 @@
logger = logging.getLogger(__name__)


# String-valued Enum with one member per variable registered in the
# policyengine-us `system`; each member's name and value are both the
# variable name. Used as a column type so only known variables are accepted.
USVariable = Enum(
    "USVariable",
    [(name, name) for name in system.variables.keys()],
    type=str,
)


class Stratum(SQLModel, table=True):
"""Represents a unique population subgroup (stratum)."""

Expand Down Expand Up @@ -79,7 +89,7 @@ class StratumConstraint(SQLModel, table=True):
__tablename__ = "stratum_constraints"

stratum_id: int = Field(foreign_key="strata.stratum_id", primary_key=True)
constraint_variable: str = Field(
constraint_variable: USVariable = Field(
primary_key=True,
description="The variable the constraint applies to (e.g., 'age').",
)
Expand Down Expand Up @@ -112,7 +122,7 @@ class Target(SQLModel, table=True):
)

target_id: Optional[int] = Field(default=None, primary_key=True)
variable: str = Field(
variable: USVariable = Field(
description="A variable defined in policyengine-us (e.g., 'income_tax')."
)
period: int = Field(
Expand Down Expand Up @@ -171,12 +181,11 @@ def calculate_definition_hash(mapper, connection, target: Stratum):
fingerprint_text = "\n".join(constraint_strings)
h = hashlib.sha256(fingerprint_text.encode("utf-8"))
target.definition_hash = h.hexdigest()
logger.info(
f"Set definition_hash for Stratum to '{target.definition_hash}'"
)


def create_database(db_uri="sqlite:///policy_data.db"):
def create_database(
db_uri: str = f"sqlite:///{STORAGE_FOLDER / 'policy_data.db'}",
):
"""
Creates a SQLite database and all the defined tables.

Expand Down
75 changes: 75 additions & 0 deletions policyengine_us_data/db/create_initial_strata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
from typing import Dict

import pandas as pd
from sqlmodel import Session, create_engine

from policyengine_us_data.storage import STORAGE_FOLDER


from policyengine_us.variables.household.demographic.geographic.ucgid.ucgid_enum import (
UCGID,
)
from policyengine_us_data.db.create_database_tables import (
Stratum,
StratumConstraint,
)


def main():
    """Create one geographic stratum per UCGID member in the policy database.

    Each member of the ``UCGID`` enum contributes a record made of its name,
    the first entry returned by ``get_hierarchical_codes()`` (its own code)
    and the second entry, when present (its parent's code). Records are
    ordered by parent code and then by own code, with parent-less records
    first, and inserted in that order into the SQLite file
    ``policy_data.db`` inside ``STORAGE_FOLDER``.

    Every inserted ``Stratum`` gets ``stratum_group_id=1``, a descriptive
    note, a single ``ucgid_str in <code>`` constraint, and a
    ``parent_stratum_id`` looked up from the strata inserted so far.
    All rows are committed together at the end.
    """
    # Flatten the UCGID enum into (name, code, parent) records.
    records = []
    for member in UCGID:
        lineage = member.get_hierarchical_codes()
        parent_code = lineage[1] if len(lineage) > 1 else None
        records.append(
            {
                "name": member.name,
                "code": lineage[0],
                "parent": parent_code,
            }
        )

    # Parent-less records sort first. NOTE(review): the loop below assumes
    # every parent is inserted before its children; that presumably holds
    # because parent codes sort ahead of child codes -- confirm against UCGID.
    hierarchy_df = pd.DataFrame(records)
    hierarchy_df = hierarchy_df.sort_values(
        by=["parent", "code"], na_position="first"
    )
    hierarchy_df = hierarchy_df.reset_index(drop=True)

    engine = create_engine(f"sqlite:///{STORAGE_FOLDER / 'policy_data.db'}")

    # ucgid code -> database-generated stratum_id of the stratum created for it
    stratum_id_by_code: Dict[str, int] = {}

    with Session(engine) as session:
        for _, record in hierarchy_df.iterrows():
            parent_code = record["parent"]

            # A missing parent (or one not inserted yet) yields None.
            if parent_code:
                parent_id = stratum_id_by_code.get(parent_code)
            else:
                parent_id = None

            stratum = Stratum(
                parent_stratum_id=parent_id,
                notes=f'{record["name"]} (ucgid {record["code"]})',
                stratum_group_id=1,
            )
            geo_constraint = StratumConstraint(
                constraint_variable="ucgid_str",
                operation="in",
                value=record["code"],
            )
            stratum.constraints_rel = [geo_constraint]

            session.add(stratum)

            # Flush so the generated stratum_id is populated before it is
            # recorded for use as a parent id by later rows.
            session.flush()

            stratum_id_by_code[record["code"]] = stratum.stratum_id

        session.commit()


if __name__ == "__main__":
main()
Loading
Loading