PolicyEngine · juaristi22 · Jul 10, 2025 · Jul 10, 2025 · Jul 10, 2025 · Jul 10, 2025
diff --git a/changelog_entry.yaml b/changelog_entry.yaml
@@ -0,0 +1,4 @@
+- bump: minor
+  changes:
+    added:
+    - Enhanced CPS minimizing tests. 
diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py
@@ -22,13 +22,27 @@
     torch = None
 
 
+bad_targets = [
+    "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household",
+    "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household",
+    "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse",
+    "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse",
+    "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household",
+    "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household",
+    "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse",
+    "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse",
+]
+
+
 def reweight(
     original_weights,
     loss_matrix,
     targets_array,
     dropout_rate=0.05,
-    log_path="calibration_log.csv",
     epochs=150,
+    log_path="calibration_log.csv",
+    penalty_approach=None,
+    penalty_weight=None,
 ):
     target_names = np.array(loss_matrix.columns)
     is_national = loss_matrix.columns.str.startswith("nation/")
@@ -46,8 +60,12 @@ def reweight(
         np.log(original_weights), requires_grad=True, dtype=torch.float32
     )
 
-    # TODO: replace this functionality from the microcalibrate package.
-    def loss(weights):
+    # TO DO: replace this with a call to the python reweight.py package.
+    def loss(
+        weights,
+        penalty_approach=penalty_approach,
+        penalty_weight=penalty_weight,
+    ):
         # Check for Nans in either the weights or the loss matrix
         if torch.isnan(weights).any():
             raise ValueError("Weights contain NaNs")
@@ -60,9 +78,51 @@ def loss(weights):
             ((estimate - targets_array) + 1) / (targets_array + 1)
         ) ** 2
         rel_error_normalized = rel_error * normalisation_factor
+
         if torch.isnan(rel_error_normalized).any():
             raise ValueError("Relative error contains NaNs")
-        return rel_error_normalized.mean()
+
+        if penalty_approach is not None and penalty_weight is not None:
+            # L0 penalty (approximated with smooth function)
+            # Since L0 is non-differentiable, we use a smooth approximation
+            # Common approaches:
+
+            epsilon = 1e-3  # Threshold for "near zero"
+
+            # Option 1: Sigmoid approximation
+            if penalty_approach == "l0_sigmoid":
+                smoothed_l0 = torch.sigmoid(
+                    (weights - epsilon) / (epsilon * 0.1)
+                ).mean()
+
+            # Option 2: Log-sum penalty (smoother)
+            if penalty_approach == "l0_log":
+                smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len(
+                    weights
+                )
+
+            # Option 3: Exponential penalty
+            if penalty_approach == "l0_exp":
+                smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean()
+
+            if penalty_approach == "l1":
+                l1 = torch.mean(weights)
+                return rel_error_normalized.mean() + penalty_weight * l1
+
+            return rel_error_normalized.mean() + penalty_weight * smoothed_l0
+
+        else:
+            return rel_error_normalized.mean()
+
+    def prune_dataset(weights, epsilon=1e-3):
+        """
+        Prune dataset samples based on learned weights.
+        Returns indices of samples to keep.
+        """
+        importance_scores = weights.detach().cpu().numpy()
+        keep_indices = np.where(importance_scores > epsilon)[0]
+
+        return keep_indices
 
     def dropout_weights(weights, p):
         if p == 0:
@@ -207,9 +267,9 @@ def generate(self):
             loss_matrix, targets_array = build_loss_matrix(
                 self.input_dataset, year
             )
-            zero_mask = np.isclose(targets_array, 0.0, atol=0.1)
+
             bad_mask = loss_matrix.columns.isin(bad_targets)
-            keep_mask_bool = ~(zero_mask | bad_mask)
+            keep_mask_bool = ~bad_mask
             keep_idx = np.where(keep_mask_bool)[0]
             loss_matrix_clean = loss_matrix.iloc[:, keep_idx]
             targets_array_clean = targets_array[keep_idx]
@@ -220,7 +280,7 @@ def generate(self):
                 loss_matrix_clean,
                 targets_array_clean,
                 log_path="calibration_log.csv",
-                epochs=150,
+                epochs= 150,
             )
             data["household_weight"][year] = optimised_weights
 

diff --git a/policyengine_us_data/storage/upload_completed_datasets.py b/policyengine_us_data/storage/upload_completed_datasets.py
@@ -15,6 +15,7 @@ def upload_datasets():
         Pooled_3_Year_CPS_2023.file_path,
         CPS_2023.file_path,
         STORAGE_FOLDER / "small_enhanced_cps_2024.h5",
+        STORAGE_FOLDER / "enhanced_cps_2024_minified.h5",
     ]
 
     for file_path in dataset_files:

diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py
@@ -552,11 +552,6 @@ def build_loss_matrix(dataset: type, time_period):
         # Convert to thousands for the target
         targets_array.append(row["enrollment"])
 
-        print(
-            f"Targeting Medicaid enrollment for {row['state']} "
-            f"with target {row['enrollment']:.0f}k"
-        )
-
     # State 10-year age targets
 
     age_targets = pd.read_csv(STORAGE_FOLDER / "age_state.csv")