Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,30 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [1.35.2] - 2025-07-02 15:31:46

### Changed

- Calibration epochs increased to 1,000 (from 500).

## [1.35.1] - 2025-07-02 15:00:11

### Fixed

- Imputed non-CPS income variables from the PUF.

## [1.35.0] - 2025-07-01 23:42:47

### Added

- Normalisation of national and state targets.

## [1.34.1] - 2025-07-01 22:12:13

### Changed

- Calibration epochs reduced to 500.

## [1.34.0] - 2025-07-01 20:10:32

### Added
Expand Down Expand Up @@ -472,6 +496,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0



[1.35.2]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.35.1...1.35.2
[1.35.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.35.0...1.35.1
[1.35.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.34.1...1.35.0
[1.34.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.34.0...1.34.1
[1.34.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.33.3...1.34.0
[1.33.3]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.33.2...1.33.3
[1.33.2]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.33.1...1.33.2
Expand Down
20 changes: 20 additions & 0 deletions changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -393,3 +393,23 @@
added:
- State real estate taxes calibration targets.
date: 2025-07-01 20:10:32
- bump: patch
changes:
changed:
- Calibration epochs reduced to 500.
date: 2025-07-01 22:12:13
- bump: minor
changes:
added:
- Normalisation of national and state targets.
date: 2025-07-01 23:42:47
- bump: patch
changes:
fixed:
- Imputed non-CPS income variables from the PUF.
date: 2025-07-02 15:00:11
- bump: patch
changes:
changed:
- Calibration epochs increased to 1,000 (from 500).
date: 2025-07-02 15:31:46
16 changes: 13 additions & 3 deletions policyengine_us_data/datasets/cps/enhanced_cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,16 @@ def reweight(
log_path="calibration_log.csv",
):
target_names = np.array(loss_matrix.columns)
is_national = loss_matrix.columns.str.startswith("nation/")
loss_matrix = torch.tensor(loss_matrix.values, dtype=torch.float32)
nation_normalisation_factor = is_national * (1 / is_national.sum())
state_normalisation_factor = ~is_national * (1 / (~is_national).sum())
normalisation_factor = np.where(
is_national, nation_normalisation_factor, state_normalisation_factor
)
normalisation_factor = torch.tensor(
normalisation_factor, dtype=torch.float32
)
targets_array = torch.tensor(targets_array, dtype=torch.float32)
weights = torch.tensor(
np.log(original_weights), requires_grad=True, dtype=torch.float32
Expand All @@ -49,9 +58,10 @@ def loss(weights):
rel_error = (
((estimate - targets_array) + 1) / (targets_array + 1)
) ** 2
if torch.isnan(rel_error).any():
rel_error_normalized = rel_error * normalisation_factor
if torch.isnan(rel_error_normalized).any():
raise ValueError("Relative error contains NaNs")
return rel_error.mean()
return rel_error_normalized.mean()

def dropout_weights(weights, p):
if p == 0:
Expand All @@ -68,7 +78,7 @@ def dropout_weights(weights, p):

start_loss = None

iterator = trange(5_000 if not os.environ.get("TEST_LITE") else 500)
iterator = trange(1_000 if not os.environ.get("TEST_LITE") else 500)
performance = pd.DataFrame()
for i in iterator:
optimizer.zero_grad()
Expand Down
116 changes: 97 additions & 19 deletions policyengine_us_data/datasets/cps/extended_cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,58 @@
"self_employment_income_would_be_qualified",
]

# Variables that ExtendedCPS.generate imputes from the PUF in a separate
# impute_income_variables call and then writes into BOTH stacked copies of the
# dataset (np.concatenate([pred_values, pred_values])).
# NOTE(review): presumably these are the income/deduction/credit inputs the
# CPS does not report itself -- confirm against the CPS variable list.
# Keep the order stable: the list is passed as the `outputs` columns of the
# imputation model.
OVERRIDDEN_IMPUTED_VARIABLES = [
    "partnership_s_corp_income",
    "interest_deduction",
    "unreimbursed_business_employee_expenses",
    "pre_tax_contributions",
    "w2_wages_from_qualified_business",
    "unadjusted_basis_qualified_property",
    "business_is_sstb",
    "charitable_cash_donations",
    "self_employed_pension_contribution_ald",
    "unrecaptured_section_1250_gain",
    "taxable_unemployment_compensation",
    "domestic_production_ald",
    "self_employed_health_insurance_ald",
    "cdcc_relevant_expenses",
    "salt_refund_income",
    "foreign_tax_credit",
    "estate_income",
    "charitable_non_cash_donations",
    "american_opportunity_credit",
    "miscellaneous_income",
    "alimony_expense",
    "health_savings_account_ald",
    "non_sch_d_capital_gains",
    "general_business_credit",
    "energy_efficient_home_improvement_credit",
    "amt_foreign_tax_credit",
    "excess_withheld_payroll_tax",
    "savers_credit",
    "student_loan_interest",
    "investment_income_elected_form_4952",
    "early_withdrawal_penalty",
    "prior_year_minimum_tax_credit",
    "farm_rent_income",
    "qualified_tuition_expenses",
    "educator_expense",
    "long_term_capital_gains_on_collectibles",
    "other_credits",
    "casualty_loss",
    "unreported_payroll_tax",
    "recapture_of_investment_credit",
    "deductible_mortgage_interest",
    "qualified_reit_and_ptp_income",
    "qualified_bdc_income",
    "farm_operations_income",
    "estate_income_would_be_qualified",
    "farm_operations_income_would_be_qualified",
    "farm_rent_income_would_be_qualified",
    "partnership_s_corp_income_would_be_qualified",
    "rental_income_would_be_qualified",
]


class ExtendedCPS(Dataset):
cps: Type[CPS]
Expand All @@ -107,24 +159,17 @@ def generate(self):
"is_tax_unit_dependent",
]

X_train = puf_sim.calculate_dataframe(INPUTS)
y_train = puf_sim.calculate_dataframe(IMPUTED_VARIABLES)
X = cps_sim.calculate_dataframe(INPUTS)
y = pd.DataFrame(columns=IMPUTED_VARIABLES, index=X.index)

model = QRF()
start = time.time()
model.fit(
X_train,
y_train,
)
print(
f"Training imputation models from the PUF took {time.time() - start:.2f} seconds"
y_full_imputations = impute_income_variables(
cps_sim,
puf_sim,
predictors=INPUTS,
outputs=IMPUTED_VARIABLES,
)
start = time.time()
y = model.predict(X)
print(
f"Predicting imputed values took {time.time() - start:.2f} seconds"
y_cps_imputations = impute_income_variables(
cps_sim,
puf_sim,
predictors=INPUTS,
outputs=OVERRIDDEN_IMPUTED_VARIABLES,
)
cps_sim = Microsimulation(dataset=self.cps)
data = cps_sim.dataset.load_dataset()
Expand All @@ -138,8 +183,16 @@ def generate(self):
values = data[variable][...]
else:
values = cps_sim.calculate(variable).values
if variable in IMPUTED_VARIABLES:
pred_values = y[variable].values
if variable in OVERRIDDEN_IMPUTED_VARIABLES:
pred_values = y_cps_imputations[variable].values
entity = variable_metadata.entity.key
if entity != "person":
pred_values = cps_sim.populations[
entity
].value_from_first_person(pred_values)
values = np.concatenate([pred_values, pred_values])
elif variable in IMPUTED_VARIABLES:
pred_values = y_full_imputations[variable].values
entity = variable_metadata.entity.key
if entity != "person":
pred_values = cps_sim.populations[
Expand All @@ -161,6 +214,31 @@ def generate(self):
self.save_dataset(new_data)


def impute_income_variables(
    cps_sim,
    puf_sim,
    predictors: list[str] = None,
    outputs: list[str] = None,
):
    """Impute ``outputs`` for the CPS using a QRF model trained on the PUF.

    A fresh ``QRF`` model is fitted on the PUF simulation
    (``predictors`` -> ``outputs``) and then used to predict ``outputs`` for
    every row of the CPS simulation's ``predictors`` frame. Training and
    prediction durations are printed to stdout.

    Args:
        cps_sim: Simulation exposing ``calculate_dataframe``; supplies the
            rows to predict for.
        puf_sim: Simulation exposing ``calculate_dataframe``; supplies the
            training data.
        predictors: Names of the variables used as model inputs. They are
            calculated on both simulations.
        outputs: Names of the variables to impute. They are calculated on
            ``puf_sim`` only.

    Returns:
        Whatever ``QRF.predict`` returns for the CPS predictor frame.
        NOTE(review): callers index it as ``y[variable].values``, so it is
        presumably a DataFrame with one column per output -- confirm
        against the QRF implementation.
    """
    X_train = puf_sim.calculate_dataframe(predictors)
    y_train = puf_sim.calculate_dataframe(outputs)
    X = cps_sim.calculate_dataframe(predictors)

    model = QRF()

    # perf_counter is monotonic, so the reported durations cannot be skewed
    # by system clock adjustments during a long fit.
    start = time.perf_counter()
    model.fit(
        X_train,
        y_train,
    )
    print(
        f"Training imputation models from the PUF took {time.perf_counter() - start:.2f} seconds"
    )

    start = time.perf_counter()
    y = model.predict(X)
    print(
        f"Predicting imputed values took {time.perf_counter() - start:.2f} seconds"
    )
    return y


class ExtendedCPS_2024(ExtendedCPS):
cps = CPS_2024
puf = PUF_2024
Expand Down
Loading
Loading