From 35e2979aa48d56ce62a10465ee48a5c9a8eda390 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Wed, 2 Jul 2025 17:48:51 +0100 Subject: [PATCH 1/2] Reduce set of CPS PUF imputed variables --- changelog_entry.yaml | 4 ++ .../datasets/cps/enhanced_cps.py | 2 +- .../datasets/cps/extended_cps.py | 43 +------------------ 3 files changed, 6 insertions(+), 43 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..0d497ed9 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: patch + changes: + fixed: + - Reduced set of CPS imputed PUF variables for simplicity. diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index f8da784d..d9d5d3ac 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -78,7 +78,7 @@ def dropout_weights(weights, p): start_loss = None - iterator = trange(1_000 if not os.environ.get("TEST_LITE") else 500) + iterator = trange(500 if not os.environ.get("TEST_LITE") else 500) performance = pd.DataFrame() for i in iterator: optimizer.zero_grad() diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index 7f952d56..136c8097 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -83,55 +83,15 @@ ] OVERRIDDEN_IMPUTED_VARIABLES = [ - "partnership_s_corp_income", "interest_deduction", - "unreimbursed_business_employee_expenses", - "pre_tax_contributions", "w2_wages_from_qualified_business", "unadjusted_basis_qualified_property", "business_is_sstb", "charitable_cash_donations", - "self_employed_pension_contribution_ald", - "unrecaptured_section_1250_gain", - "taxable_unemployment_compensation", - "domestic_production_ald", - "self_employed_health_insurance_ald", - "cdcc_relevant_expenses", - "salt_refund_income", - "foreign_tax_credit", - "estate_income", "charitable_non_cash_donations", - "american_opportunity_credit", - "miscellaneous_income", - "alimony_expense", - "health_savings_account_ald", - "non_sch_d_capital_gains", - "general_business_credit", - "energy_efficient_home_improvement_credit", - "amt_foreign_tax_credit", - "excess_withheld_payroll_tax", - "savers_credit", - "student_loan_interest", - "investment_income_elected_form_4952", - "early_withdrawal_penalty", - "prior_year_minimum_tax_credit", - "farm_rent_income", - "qualified_tuition_expenses", - "educator_expense", - "long_term_capital_gains_on_collectibles", "other_credits", "casualty_loss", - "unreported_payroll_tax", - "recapture_of_investment_credit", "deductible_mortgage_interest", - "qualified_reit_and_ptp_income", - "qualified_bdc_income", - "farm_operations_income", - "estate_income_would_be_qualified", - "farm_operations_income_would_be_qualified", - "farm_rent_income_would_be_qualified", - "partnership_s_corp_income_would_be_qualified", - "rental_income_would_be_qualified", ] @@ -146,8 +106,7 @@ def generate(self): cps_sim = Microsimulation(dataset=self.cps) puf_sim = Microsimulation(dataset=self.puf) - if os.environ.get("TEST_LITE"): - puf_sim.subsample(1_000) + puf_sim.subsample(100_000) INPUTS = [ "age", From f73b19084ec76a902d57aabbe38781ce78860f64 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Wed, 2 Jul 2025 17:59:19 +0100 Subject: [PATCH 2/2] Run more similar to prodx --- policyengine_us_data/datasets/cps/cps.py | 30 +++++-------------- .../datasets/cps/enhanced_cps.py | 2 +- policyengine_us_data/datasets/puf/puf.py | 10 +++---- policyengine_us_data/datasets/sipp/sipp.py | 2 +- 4 files changed, 14 insertions(+), 30 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index ae76e016..a759876a 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -169,9 +169,7 @@ def add_rent(self, cps: h5py.File, person: DataFrame, household: DataFrame): }, na_action="ignore", ).fillna(train_df.tenure_type) - train_df = train_df[train_df.is_household_head].sample( - 100_000 if not test_lite else 1_000 - ) + train_df = train_df[train_df.is_household_head] inference_df = cps_sim.calculate_dataframe(PREDICTORS) mask = inference_df.is_household_head.values inference_df = inference_df[mask] @@ -1837,25 +1835,13 @@ def determine_reference_person(group): logging.getLogger("microimpute").setLevel(getattr(logging, log_level)) qrf_model = QRF() - if test_lite: - donor_data = donor_data.sample(frac=0.1, random_state=42).reset_index( - drop=True - ) - fitted_model = qrf_model.fit( - X_train=donor_data, - predictors=PREDICTORS, - imputed_variables=IMPUTED_VARIABLES, - weight_col=weights[0], - tune_hyperparameters=False, - ) - else: - fitted_model = qrf_model.fit( - X_train=donor_data, - predictors=PREDICTORS, - imputed_variables=IMPUTED_VARIABLES, - weight_col=weights[0], - tune_hyperparameters=False, - ) + fitted_model = qrf_model.fit( + X_train=donor_data, + predictors=PREDICTORS, + imputed_variables=IMPUTED_VARIABLES, + weight_col=weights[0], + tune_hyperparameters=False, + ) imputations = fitted_model.predict(X_test=receiver_data) for var in IMPUTED_VARIABLES: diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index d9d5d3ac..74460fe2 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -78,7 +78,7 @@ def dropout_weights(weights, p): start_loss = None - iterator = trange(500 if not os.environ.get("TEST_LITE") else 500) + iterator = trange(500) performance = pd.DataFrame() for i in iterator: optimizer.zero_grad() diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py index f1a057c4..0c9ce157 100644 --- a/policyengine_us_data/datasets/puf/puf.py +++ b/policyengine_us_data/datasets/puf/puf.py @@ -168,8 +168,7 @@ def impute_pension_contributions_to_puf(puf_df): from policyengine_us_data.datasets.cps import CPS_2021 cps = Microsimulation(dataset=CPS_2021) - if os.environ.get("TEST_LITE"): - cps.subsample(1_000) + cps.subsample(1_000) cps_df = cps.calculate_dataframe( ["employment_income", "household_weight", "pre_tax_contributions"] ) @@ -198,10 +197,9 @@ def impute_missing_demographics( .fillna(0) ) - if os.environ.get("TEST_LITE"): - puf_with_demographics = puf_with_demographics.sample( - n=1_000, random_state=0 - ) + puf_with_demographics = puf_with_demographics.sample( + n=1_000, random_state=0 + ) DEMOGRAPHIC_VARIABLES = [ "AGEDP1", diff --git a/policyengine_us_data/datasets/sipp/sipp.py b/policyengine_us_data/datasets/sipp/sipp.py index 1d89a7d8..8f26aeb8 100644 --- a/policyengine_us_data/datasets/sipp/sipp.py +++ b/policyengine_us_data/datasets/sipp/sipp.py @@ -103,7 +103,7 @@ def train_tip_model(): sipp = sipp.loc[ np.random.choice( sipp.index, - size=100_000 if not test_lite else 1_000, + size=10_000, replace=True, p=sipp.household_weight / sipp.household_weight.sum(), )