PolicyEngine · eccuraa · Jul 1, 2025 · Jul 1, 2025 · Jul 1, 2025 · Jul 1, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,30 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.35.2] - 2025-07-02 15:31:46
+
+### Changed
+
+- Calibration epochs increased to 1,000.
+
+## [1.35.1] - 2025-07-02 15:00:11
+
+### Fixed
+
+- Imputed non-CPS income variables from the PUF.
+
+## [1.35.0] - 2025-07-01 23:42:47
+
+### Added
+
+- Normalisation of national and state targets.
+
+## [1.34.1] - 2025-07-01 22:12:13
+
+### Changed
+
+- Calibration epochs reduced to 500.
+
 ## [1.34.0] - 2025-07-01 20:10:32
 
 ### Added
@@ -472,6 +496,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 
 
+[1.35.2]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.35.1...1.35.2
+[1.35.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.35.0...1.35.1
+[1.35.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.34.1...1.35.0
+[1.34.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.34.0...1.34.1
 [1.34.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.33.3...1.34.0
 [1.33.3]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.33.2...1.33.3
 [1.33.2]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.33.1...1.33.2

diff --git a/changelog.yaml b/changelog.yaml
@@ -393,3 +393,23 @@
     added:
     - State real estate taxes calibration targets.
   date: 2025-07-01 20:10:32
+- bump: patch
+  changes:
+    changed:
+    - Calibration epochs reduced to 500.
+  date: 2025-07-01 22:12:13
+- bump: minor
+  changes:
+    added:
+    - Normalisation of national and state targets.
+  date: 2025-07-01 23:42:47
+- bump: patch
+  changes:
+    fixed:
+    - Imputed non-CPS income variables from the PUF.
+  date: 2025-07-02 15:00:11
+- bump: patch
+  changes:
+    changed:
+    - Calibration epochs increased to 1,000.
+  date: 2025-07-02 15:31:46
diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py
@@ -30,7 +30,16 @@ def reweight(
     log_path="calibration_log.csv",
 ):
     target_names = np.array(loss_matrix.columns)
+    is_national = loss_matrix.columns.str.startswith("nation/")
     loss_matrix = torch.tensor(loss_matrix.values, dtype=torch.float32)
+    nation_normalisation_factor = is_national * (1 / is_national.sum())
+    state_normalisation_factor = ~is_national * (1 / (~is_national).sum())
+    normalisation_factor = np.where(
+        is_national, nation_normalisation_factor, state_normalisation_factor
+    )
+    normalisation_factor = torch.tensor(
+        normalisation_factor, dtype=torch.float32
+    )
     targets_array = torch.tensor(targets_array, dtype=torch.float32)
     weights = torch.tensor(
         np.log(original_weights), requires_grad=True, dtype=torch.float32
@@ -49,9 +58,10 @@ def loss(weights):
         rel_error = (
             ((estimate - targets_array) + 1) / (targets_array + 1)
         ) ** 2
-        if torch.isnan(rel_error).any():
+        rel_error_normalized = rel_error * normalisation_factor
+        if torch.isnan(rel_error_normalized).any():
             raise ValueError("Relative error contains NaNs")
-        return rel_error.mean()
+        return rel_error_normalized.mean()
 
     def dropout_weights(weights, p):
         if p == 0:
@@ -68,7 +78,7 @@ def dropout_weights(weights, p):
 
     start_loss = None
 
-    iterator = trange(5_000 if not os.environ.get("TEST_LITE") else 500)
+    iterator = trange(1_000 if not os.environ.get("TEST_LITE") else 500)
     performance = pd.DataFrame()
     for i in iterator:
         optimizer.zero_grad()

diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py
@@ -82,6 +82,58 @@
     "self_employment_income_would_be_qualified",
 ]
 
+OVERRIDDEN_IMPUTED_VARIABLES = [  # Variables for which generate() uses the PUF-trained prediction in BOTH concatenated halves of the output; checked before IMPUTED_VARIABLES.
+    "partnership_s_corp_income",
+    "interest_deduction",
+    "unreimbursed_business_employee_expenses",
+    "pre_tax_contributions",
+    "w2_wages_from_qualified_business",
+    "unadjusted_basis_qualified_property",
+    "business_is_sstb",
+    "charitable_cash_donations",
+    "self_employed_pension_contribution_ald",
+    "unrecaptured_section_1250_gain",
+    "taxable_unemployment_compensation",
+    "domestic_production_ald",
+    "self_employed_health_insurance_ald",
+    "cdcc_relevant_expenses",
+    "salt_refund_income",
+    "foreign_tax_credit",
+    "estate_income",
+    "charitable_non_cash_donations",
+    "american_opportunity_credit",
+    "miscellaneous_income",
+    "alimony_expense",
+    "health_savings_account_ald",
+    "non_sch_d_capital_gains",
+    "general_business_credit",
+    "energy_efficient_home_improvement_credit",
+    "amt_foreign_tax_credit",
+    "excess_withheld_payroll_tax",
+    "savers_credit",
+    "student_loan_interest",
+    "investment_income_elected_form_4952",
+    "early_withdrawal_penalty",
+    "prior_year_minimum_tax_credit",
+    "farm_rent_income",
+    "qualified_tuition_expenses",
+    "educator_expense",
+    "long_term_capital_gains_on_collectibles",
+    "other_credits",
+    "casualty_loss",
+    "unreported_payroll_tax",
+    "recapture_of_investment_credit",
+    "deductible_mortgage_interest",
+    "qualified_reit_and_ptp_income",
+    "qualified_bdc_income",
+    "farm_operations_income",
+    "estate_income_would_be_qualified",
+    "farm_operations_income_would_be_qualified",
+    "farm_rent_income_would_be_qualified",
+    "partnership_s_corp_income_would_be_qualified",
+    "rental_income_would_be_qualified",
+]
+
 
 class ExtendedCPS(Dataset):
     cps: Type[CPS]
@@ -107,24 +159,17 @@ def generate(self):
             "is_tax_unit_dependent",
         ]
 
-        X_train = puf_sim.calculate_dataframe(INPUTS)
-        y_train = puf_sim.calculate_dataframe(IMPUTED_VARIABLES)
-        X = cps_sim.calculate_dataframe(INPUTS)
-        y = pd.DataFrame(columns=IMPUTED_VARIABLES, index=X.index)
-
-        model = QRF()
-        start = time.time()
-        model.fit(
-            X_train,
-            y_train,
-        )
-        print(
-            f"Training imputation models from the PUF took {time.time() - start:.2f} seconds"
+        y_full_imputations = impute_income_variables(
+            cps_sim,
+            puf_sim,
+            predictors=INPUTS,
+            outputs=IMPUTED_VARIABLES,
         )
-        start = time.time()
-        y = model.predict(X)
-        print(
-            f"Predicting imputed values took {time.time() - start:.2f} seconds"
+        y_cps_imputations = impute_income_variables(
+            cps_sim,
+            puf_sim,
+            predictors=INPUTS,
+            outputs=OVERRIDDEN_IMPUTED_VARIABLES,
         )
         cps_sim = Microsimulation(dataset=self.cps)
         data = cps_sim.dataset.load_dataset()
@@ -138,8 +183,16 @@ def generate(self):
                 values = data[variable][...]
             else:
                 values = cps_sim.calculate(variable).values
-            if variable in IMPUTED_VARIABLES:
-                pred_values = y[variable].values
+            if variable in OVERRIDDEN_IMPUTED_VARIABLES:
+                pred_values = y_cps_imputations[variable].values
+                entity = variable_metadata.entity.key
+                if entity != "person":
+                    pred_values = cps_sim.populations[
+                        entity
+                    ].value_from_first_person(pred_values)
+                values = np.concatenate([pred_values, pred_values])
+            elif variable in IMPUTED_VARIABLES:
+                pred_values = y_full_imputations[variable].values
                 entity = variable_metadata.entity.key
                 if entity != "person":
                     pred_values = cps_sim.populations[
@@ -161,6 +214,31 @@ def generate(self):
         self.save_dataset(new_data)
 
 
+def impute_income_variables(  # Fit a QRF on PUF-simulation data and return its predictions of `outputs` for the CPS-simulation records.
+    cps_sim,  # simulation whose records are predicted for; only calculate_dataframe() is called on it
+    puf_sim,  # simulation that supplies both the training features and the training targets
+    predictors: list[str] = None,  # feature variable names; NOTE(review): None is not handled, it is passed straight to calculate_dataframe
+    outputs: list[str] = None,  # target variable names; NOTE(review): same caveat as predictors, callers in this file always pass a list
+):
+    X_train = puf_sim.calculate_dataframe(predictors)  # training features, from the PUF simulation
+    y_train = puf_sim.calculate_dataframe(outputs)  # training targets, from the PUF simulation
+    X = cps_sim.calculate_dataframe(predictors)  # features of the records to impute for
+    y = pd.DataFrame(columns=outputs, index=X.index)  # NOTE(review): never read; y is rebound by model.predict(X) below
+    model = QRF()  # QRF is defined outside this view; presumably a quantile random forest, TODO confirm
+    start = time.time()  # wall-clock timer for the fit step
+    model.fit(
+        X_train,
+        y_train,
+    )
+    print(
+        f"Training imputation models from the PUF took {time.time() - start:.2f} seconds"
+    )
+    start = time.time()  # timer restarted for the predict step
+    y = model.predict(X)  # callers index the result by output name (y[variable].values), so a column per output is expected
+    print(f"Predicting imputed values took {time.time() - start:.2f} seconds")
+    return y
+
+
 class ExtendedCPS_2024(ExtendedCPS):
     cps = CPS_2024
     puf = PUF_2024