From 4c0f1c611e85436fe5b1c0e1c87deb386846d761 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Thu, 10 Jul 2025 21:20:39 +0100 Subject: [PATCH 01/56] Shrink datasets --- policyengine_us_data/utils/minimise.py | 85 ++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 policyengine_us_data/utils/minimise.py diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py new file mode 100644 index 00000000..4355e889 --- /dev/null +++ b/policyengine_us_data/utils/minimise.py @@ -0,0 +1,85 @@ +from policyengine_us_data.utils.loss import build_loss_matrix +from policyengine_core.data import Dataset +from policyengine_us import Microsimulation +import numpy as np +import pandas as pd + +def minimise_dataset(dataset, output_path: str, loss_rel_change_max: float) -> None: + # if loading from a .h5 file, need to do dataset = Dataset.from_file(dataset) + loss_matrix = build_loss_matrix(dataset, 2024) + + sim = Microsimulation(dataset=dataset) + + weights = sim.calculate("household_weight", 2024).values + estimate_matrix, targets = loss_matrix + is_national = estimate_matrix.columns.str.startswith("nation/") + nation_normalisation_factor = is_national * (1 / is_national.sum()) + state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) + normalisation_factor = np.where( + is_national, nation_normalisation_factor, state_normalisation_factor + ) + weights @ estimate_matrix + + def get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_factor): + """ + Calculate the loss based on the inclusion mask and the estimate matrix. + """ + masked_weights = weights.copy() + original_weight_total = masked_weights.sum() + masked_weights[~inclusion_mask] = 0 + masked_weight_total = masked_weights.sum() + masked_weights[inclusion_mask] *= original_weight_total / masked_weight_total + estimates = masked_weights @ estimate_matrix + rel_error = ((estimates - targets) + 1) / (targets + 1) + loss = ((rel_error * normalisation_factor) ** 2).mean() + + return loss + + COUNT_ITERATIONS = 5 + FRACTION_REMOVE_PER_ITERATION = 0.1 + from tqdm import tqdm + + full_mask = np.ones_like(weights, dtype=bool) + for i in range(COUNT_ITERATIONS): + inclusion_mask = full_mask.copy() + baseline_loss = get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_factor) + household_loss_rel_changes = [] + for household_index in tqdm(range(len(weights))): + # Skip if this household is already excluded + if not inclusion_mask[household_index]: + household_loss_rel_changes.append(np.inf) + continue + # Calculate loss if this household is removed + inclusion_mask = inclusion_mask.copy() + inclusion_mask[household_index] = False + loss = get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_factor) + rel_change = (loss - baseline_loss) / baseline_loss + household_loss_rel_changes.append(rel_change) + inclusion_mask = full_mask.copy() + household_loss_rel_changes = np.array(household_loss_rel_changes) + # Sort by the relative change in loss + sorted_indices = np.argsort(household_loss_rel_changes) + # Remove the worst households + num_to_remove = int(len(weights) * FRACTION_REMOVE_PER_ITERATION) + worst_indices = sorted_indices[:num_to_remove] + inclusion_mask[worst_indices] = False + # Calculate the new loss + new_loss = get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_factor) + print(f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}") + print(f"Removed {num_to_remove} 
households with worst relative loss changes.") + # Update the full mask + full_mask &= inclusion_mask + + household_ids = sim.calculate("household_id", 2024).values + remaining_households = household_ids[full_mask] + + # At this point we have a mask of households to keep + + # I'm saving to a csv for ease of debugging, but we need to save to a .h5 file + + df = sim.to_input_dataframe() + df = df[df["household_id__2024"].isin(remaining_households)] + + df.to_csv(output_path, index=False) + + return df \ No newline at end of file From 6b2a56f6f8a55aacb4ee9e305bd53c74f36c70b0 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Thu, 10 Jul 2025 22:25:27 +0100 Subject: [PATCH 02/56] Move to package --- Makefile | 1 + .../storage/upload_completed_datasets.py | 1 + policyengine_us_data/utils/minimise.py | 127 +++++++++++++++--- 3 files changed, 114 insertions(+), 15 deletions(-) diff --git a/Makefile b/Makefile index 788ba1d3..90b2817a 100644 --- a/Makefile +++ b/Makefile @@ -46,6 +46,7 @@ data: python policyengine_us_data/datasets/cps/extended_cps.py python policyengine_us_data/datasets/cps/enhanced_cps.py python policyengine_us_data/datasets/cps/small_enhanced_cps.py + python policyengine_us_data/utils/minimise.py clean: rm -f policyengine_us_data/storage/*.h5 diff --git a/policyengine_us_data/storage/upload_completed_datasets.py b/policyengine_us_data/storage/upload_completed_datasets.py index f161a9ee..16885d8c 100644 --- a/policyengine_us_data/storage/upload_completed_datasets.py +++ b/policyengine_us_data/storage/upload_completed_datasets.py @@ -15,6 +15,7 @@ def upload_datasets(): Pooled_3_Year_CPS_2023.file_path, CPS_2023.file_path, STORAGE_FOLDER / "small_enhanced_cps_2024.h5", + STORAGE_FOLDER / "enhanced_cps_2024_minified.h5", ] for file_path in dataset_files: diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 4355e889..6fe511fd 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -3,9 +3,43 @@ from policyengine_us import Microsimulation import numpy as np import pandas as pd +import h5py +from policyengine_us_data.storage import STORAGE_FOLDER + + +def create_calibration_log_file(file_path): + dataset = Dataset.from_file(file_path) + + loss_matrix = build_loss_matrix(dataset, 2024) + + sim = Microsimulation(dataset=dataset) + + estimates = sim.calculate("household_weight", 2024).values @ loss_matrix[0] + target_names = loss_matrix[0].columns + target_values = loss_matrix[1] + + df = pd.DataFrame( + { + "target_name": target_names, + "estimate": estimates, + "target": target_values, + } + ) + df["epoch"] = 0 + df["error"] = df["estimate"] - df["target"] + df["rel_error"] = df["error"] / df["target"] + df["abs_error"] = df["error"].abs() + df["rel_abs_error"] = df["abs_error"] / df["target"].abs() + df["loss"] = (df["rel_error"] ** 2).mean() + + df.to_csv(file_path.replace(".h5", "_calibration_log.csv"), index=False) + + +def minimise_dataset( + dataset, output_path: str, loss_rel_change_max: float +) -> None: + create_calibration_log_file(dataset) -def minimise_dataset(dataset, output_path: str, loss_rel_change_max: float) -> None: - # if loading from a .h5 file, need to do dataset = Dataset.from_file(dataset) loss_matrix = build_loss_matrix(dataset, 2024) sim = Microsimulation(dataset=dataset) @@ -20,15 +54,20 @@ def minimise_dataset(dataset, output_path: str, loss_rel_change_max: float) -> N ) weights @ estimate_matrix - def get_loss_from_mask(inclusion_mask, estimate_matrix, targets, 
normalisation_factor): + def get_loss_from_mask( + inclusion_mask, estimate_matrix, targets, normalisation_factor + ): """ Calculate the loss based on the inclusion mask and the estimate matrix. """ masked_weights = weights.copy() original_weight_total = masked_weights.sum() - masked_weights[~inclusion_mask] = 0 + if (~inclusion_mask).sum() > 0: + masked_weights[~inclusion_mask] = 0 masked_weight_total = masked_weights.sum() - masked_weights[inclusion_mask] *= original_weight_total / masked_weight_total + masked_weights[inclusion_mask] *= ( + original_weight_total / masked_weight_total + ) estimates = masked_weights @ estimate_matrix rel_error = ((estimates - targets) + 1) / (targets + 1) loss = ((rel_error * normalisation_factor) ** 2).mean() @@ -36,15 +75,23 @@ def get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_f return loss COUNT_ITERATIONS = 5 + VIEW_FRACTION_PER_ITERATION = 0.3 FRACTION_REMOVE_PER_ITERATION = 0.1 from tqdm import tqdm full_mask = np.ones_like(weights, dtype=bool) for i in range(COUNT_ITERATIONS): inclusion_mask = full_mask.copy() - baseline_loss = get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_factor) + baseline_loss = get_loss_from_mask( + inclusion_mask, estimate_matrix, targets, normalisation_factor + ) household_loss_rel_changes = [] - for household_index in tqdm(range(len(weights))): + indices = np.random.choice( + np.arange(len(weights)), + size=int(len(weights) * VIEW_FRACTION_PER_ITERATION), + replace=False, + ) + for household_index in tqdm(indices): # Skip if this household is already excluded if not inclusion_mask[household_index]: household_loss_rel_changes.append(np.inf) @@ -52,7 +99,9 @@ def get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_f # Calculate loss if this household is removed inclusion_mask = inclusion_mask.copy() inclusion_mask[household_index] = False - loss = get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_factor) + loss = get_loss_from_mask( + inclusion_mask, estimate_matrix, targets, normalisation_factor + ) rel_change = (loss - baseline_loss) / baseline_loss household_loss_rel_changes.append(rel_change) inclusion_mask = full_mask.copy() @@ -64,12 +113,24 @@ def get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_f worst_indices = sorted_indices[:num_to_remove] inclusion_mask[worst_indices] = False # Calculate the new loss - new_loss = get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_factor) - print(f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}") - print(f"Removed {num_to_remove} households with worst relative loss changes.") + new_loss = get_loss_from_mask( + inclusion_mask, estimate_matrix, targets, normalisation_factor + ) + rel_change = (new_loss - baseline_loss) / baseline_loss + if rel_change > loss_rel_change_max: + print( + f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}, which is too high ({rel_change:.2%}). Stopping." + ) + break + print( + f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}" + ) + print( + f"Removed {num_to_remove} households with worst relative loss changes." 
+ ) # Update the full mask full_mask &= inclusion_mask - + household_ids = sim.calculate("household_id", 2024).values remaining_households = household_ids[full_mask] @@ -78,8 +139,44 @@ def get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_f # I'm saving to a csv for ease of debugging, but we need to save to a .h5 file df = sim.to_input_dataframe() - df = df[df["household_id__2024"].isin(remaining_households)] + smaller_df = df[df["household_id__2024"].isin(remaining_households)] + + weight_rel_change = ( + smaller_df["household_weight__2024"].sum() + / df["household_weight__2024"].sum() + ) + print(f"Weight relative change: {weight_rel_change:.2%}") + + sim = Microsimulation(dataset=smaller_df) + + sim.set_input( + "household_weight", + 2024, + sim.calculate("household_weight", 2024).values / weight_rel_change, + ) + + data = {} + + for variable in sim.input_variables: + data[variable] = {2024: sim.calculate(variable, 2024).values} + if data[variable][2024].dtype == "object": + data[variable][2024] = data[variable][2024].astype("S") + + with h5py.File(output_path, "w") as f: + for variable, values in data.items(): + for year, value in values.items(): + f.create_dataset(f"{variable}/{year}", data=value) + print(f"Saved minimised dataset to {output_path}") + + create_calibration_log_file(output_path) + - df.to_csv(output_path, index=False) +if __name__ == "__main__": + # Example usage + files = [ + STORAGE_FOLDER / "enhanced_cps_2024.h5", + ] - return df \ No newline at end of file + for file in files: + output_path = file.with_name(file.stem + "_minimised.h5") + minimise_dataset(file, output_path, loss_rel_change_max=10) From 05ee7e4075293057756d24da0e23b36a6cfe3465 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Thu, 10 Jul 2025 23:50:52 +0100 Subject: [PATCH 03/56] Try L0 --- Makefile | 1 - .../datasets/cps/enhanced_cps.py | 18 +++++++++++++++++- policyengine_us_data/utils/minimise.py | 4 +++- 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 90b2817a..788ba1d3 100644 --- a/Makefile +++ b/Makefile @@ -46,7 +46,6 @@ data: python policyengine_us_data/datasets/cps/extended_cps.py python policyengine_us_data/datasets/cps/enhanced_cps.py python policyengine_us_data/datasets/cps/small_enhanced_cps.py - python policyengine_us_data/utils/minimise.py clean: rm -f policyengine_us_data/storage/*.h5 diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index b8af12ce..9e61414c 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -59,9 +59,25 @@ def loss(weights): ((estimate - targets_array) + 1) / (targets_array + 1) ) ** 2 rel_error_normalized = rel_error * normalisation_factor + + # L0 penalty (approximated with smooth function) + # Since L0 is non-differentiable, we use a smooth approximation + # Common approaches: + + # Option 1: Sigmoid approximation + epsilon = 1e-3 # Threshold for "near zero" + l0_penalty_weight = 1e-1 # Adjust this hyperparameter + smoothed_l0 = torch.sigmoid((weights - epsilon) / (epsilon * 0.1)).mean() + + # Option 2: Log-sum penalty (smoother) + # smoothed_l0 = torch.log(1 + actual_weights / epsilon).sum() / len(actual_weights) + + # Option 3: Exponential penalty + # smoothed_l0 = (1 - torch.exp(-actual_weights / epsilon)).mean() + if torch.isnan(rel_error_normalized).any(): raise ValueError("Relative error contains NaNs") - return rel_error_normalized.mean() + return 
rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 def dropout_weights(weights, p): if p == 0: diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 6fe511fd..2b122fec 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -32,14 +32,16 @@ def create_calibration_log_file(file_path): df["rel_abs_error"] = df["abs_error"] / df["target"].abs() df["loss"] = (df["rel_error"] ** 2).mean() - df.to_csv(file_path.replace(".h5", "_calibration_log.csv"), index=False) + df.to_csv(str(file_path).replace(".h5", "_calibration_log.csv"), index=False) def minimise_dataset( dataset, output_path: str, loss_rel_change_max: float ) -> None: + dataset = str(dataset) create_calibration_log_file(dataset) + dataset = Dataset.from_file(dataset) loss_matrix = build_loss_matrix(dataset, 2024) sim = Microsimulation(dataset=dataset) From e38c6479483c9b2fb0cca9939c881995267a10d7 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Thu, 10 Jul 2025 23:54:02 +0100 Subject: [PATCH 04/56] Format --- policyengine_us_data/datasets/cps/enhanced_cps.py | 10 ++++++---- policyengine_us_data/utils/minimise.py | 4 +++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 9e61414c..7d81a0c0 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -63,15 +63,17 @@ def loss(weights): # L0 penalty (approximated with smooth function) # Since L0 is non-differentiable, we use a smooth approximation # Common approaches: - + # Option 1: Sigmoid approximation epsilon = 1e-3 # Threshold for "near zero" l0_penalty_weight = 1e-1 # Adjust this hyperparameter - smoothed_l0 = torch.sigmoid((weights - epsilon) / (epsilon * 0.1)).mean() - + smoothed_l0 = torch.sigmoid( + (weights - epsilon) / (epsilon * 0.1) + ).mean() + # Option 2: Log-sum penalty (smoother) # smoothed_l0 = torch.log(1 + actual_weights / epsilon).sum() / len(actual_weights) - + # Option 3: Exponential penalty # smoothed_l0 = (1 - torch.exp(-actual_weights / epsilon)).mean() diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 2b122fec..186a7673 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -32,7 +32,9 @@ def create_calibration_log_file(file_path): df["rel_abs_error"] = df["abs_error"] / df["target"].abs() df["loss"] = (df["rel_error"] ** 2).mean() - df.to_csv(str(file_path).replace(".h5", "_calibration_log.csv"), index=False) + df.to_csv( + str(file_path).replace(".h5", "_calibration_log.csv"), index=False + ) def minimise_dataset( From bdf3d6d89d16ac396786899ce3e3233c0c46ceb4 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Fri, 11 Jul 2025 16:24:22 +0200 Subject: [PATCH 05/56] attempting to vectorize minimizing of ecps --- changelog_entry.yaml | 4 + .../datasets/cps/enhanced_cps.py | 27 +++--- policyengine_us_data/utils/minimise.py | 83 ++++++++++++++++--- 3 files changed, 91 insertions(+), 23 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..84eeb584 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: minor + changes: + added: + - Enhanced CPS minimizing tests. 
\ No newline at end of file diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 7d81a0c0..bf303f7a 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -45,8 +45,8 @@ def reweight( np.log(original_weights), requires_grad=True, dtype=torch.float32 ) - # TODO: replace this with a call to the python reweight.py package. - def loss(weights): + # TO DO: replace this with a call to the python reweight.py package. + def loss(weights, penalty_approach="l0_sigmoid"): # Check for Nans in either the weights or the loss matrix if torch.isnan(weights).any(): raise ValueError("Weights contain NaNs") @@ -60,25 +60,32 @@ def loss(weights): ) ** 2 rel_error_normalized = rel_error * normalisation_factor + if torch.isnan(rel_error_normalized).any(): + raise ValueError("Relative error contains NaNs") + # L0 penalty (approximated with smooth function) # Since L0 is non-differentiable, we use a smooth approximation # Common approaches: - # Option 1: Sigmoid approximation epsilon = 1e-3 # Threshold for "near zero" l0_penalty_weight = 1e-1 # Adjust this hyperparameter - smoothed_l0 = torch.sigmoid( - (weights - epsilon) / (epsilon * 0.1) - ).mean() + + # Option 1: Sigmoid approximation + if penalty_approach == "l0_sigmoid": + smoothed_l0 = torch.sigmoid( + (weights - epsilon) / (epsilon * 0.1) + ).mean() # Option 2: Log-sum penalty (smoother) - # smoothed_l0 = torch.log(1 + actual_weights / epsilon).sum() / len(actual_weights) + if penalty_approach == "l0_log": + smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len(weights) # Option 3: Exponential penalty - # smoothed_l0 = (1 - torch.exp(-actual_weights / epsilon)).mean() + if penalty_approach == "l0_exp": + smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() + + # L1 penalty - if torch.isnan(rel_error_normalized).any(): - raise ValueError("Relative error contains NaNs") return rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 def dropout_weights(weights, p): diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 186a7673..94601d02 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -5,6 +5,7 @@ import pandas as pd import h5py from policyengine_us_data.storage import STORAGE_FOLDER +from typing import Optional def create_calibration_log_file(file_path): @@ -37,6 +38,57 @@ def create_calibration_log_file(file_path): ) +def losses_for_candidates( + base_weights: np.ndarray, + idxs: np.ndarray, + est_mat: np.ndarray, + targets: np.ndarray, + norm: np.ndarray, + chunk_size: Optional[int] = 25_000, +) -> np.ndarray: + """ + Return the loss value *for each* candidate deletion in `idxs` + in one matrix multiplication. 
+ + Parameters + ---------- + base_weights : (n,) original weight vector + idxs : (k,) candidate row indices to zero-out + est_mat : (n, m) estimate matrix + targets : (m,) calibration targets + norm : (m,) normalisation factors + chunk_size : max number of candidates to process at once + + Returns + ------- + losses : (k,) loss if row i were removed (and weights rescaled) + """ + W = base_weights + total = W.sum() + k = len(idxs) + losses = np.empty(k, dtype=float) + + # Work through the candidate list in blocks + for start in range(0, k, chunk_size): + stop = min(start + chunk_size, k) + part = idxs[start:stop] # (p,) where p ≤ chunk_size + p = len(part) + + # Build the delta matrix only for this chunk + delta = np.zeros((p, len(W))) + delta[np.arange(p), part] = -W[part] + + keep_total = total + delta.sum(axis=1) # (p,) + delta *= (total / keep_total)[:, None] + + # Matrix–matrix multiply → one matrix multiplication per chunk + ests = (W + delta) @ est_mat # (p, m) + rel_err = ((ests - targets) + 1) / (targets + 1) + losses[start:stop] = ((rel_err * norm) ** 2).mean(axis=1) + + return losses + + def minimise_dataset( dataset, output_path: str, loss_rel_change_max: float ) -> None: @@ -95,19 +147,24 @@ def get_loss_from_mask( size=int(len(weights) * VIEW_FRACTION_PER_ITERATION), replace=False, ) - for household_index in tqdm(indices): - # Skip if this household is already excluded - if not inclusion_mask[household_index]: - household_loss_rel_changes.append(np.inf) - continue - # Calculate loss if this household is removed - inclusion_mask = inclusion_mask.copy() - inclusion_mask[household_index] = False - loss = get_loss_from_mask( - inclusion_mask, estimate_matrix, targets, normalisation_factor - ) - rel_change = (loss - baseline_loss) / baseline_loss - household_loss_rel_changes.append(rel_change) + + # more efficient approach to compute losses for candidate households to be removed + + # 1. sample only households that are currently *included* + indices = np.random.choice( + np.where(full_mask)[0], + size=int(full_mask.sum() * VIEW_FRACTION_PER_ITERATION), + replace=False, + ) + # 2. compute losses for the batch in one shot + candidate_losses = losses_for_candidates( + weights, indices, estimate_matrix, targets, normalisation_factor + ) + # 3. convert to relative change vs. 
baseline + household_loss_rel_changes = ( + candidate_losses - baseline_loss + ) / baseline_loss + inclusion_mask = full_mask.copy() household_loss_rel_changes = np.array(household_loss_rel_changes) # Sort by the relative change in loss From 03e5d0d380494b698cbcb4af14b5c8eb256754d0 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Fri, 11 Jul 2025 16:52:43 +0200 Subject: [PATCH 06/56] adding random sampling minimization strategy --- policyengine_us_data/utils/minimise.py | 240 ++++++++++++++++++------- 1 file changed, 173 insertions(+), 67 deletions(-) diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 94601d02..45212905 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -5,7 +5,7 @@ import pandas as pd import h5py from policyengine_us_data.storage import STORAGE_FOLDER -from typing import Optional +from typing import Optional, Callable def create_calibration_log_file(file_path): @@ -89,116 +89,214 @@ def losses_for_candidates( return losses -def minimise_dataset( - dataset, output_path: str, loss_rel_change_max: float -) -> None: - dataset = str(dataset) - create_calibration_log_file(dataset) +def get_loss_from_mask( + weights, inclusion_mask, estimate_matrix, targets, normalisation_factor +): + """ + Calculate the loss based on the inclusion mask and the estimate matrix. + """ + masked_weights = weights.copy() + original_weight_total = masked_weights.sum() + if (~inclusion_mask).sum() > 0: + masked_weights[~inclusion_mask] = 0 + masked_weight_total = masked_weights.sum() + masked_weights[inclusion_mask] *= ( + original_weight_total / masked_weight_total + ) + estimates = masked_weights @ estimate_matrix + rel_error = ((estimates - targets) + 1) / (targets + 1) + loss = ((rel_error * normalisation_factor) ** 2).mean() - dataset = Dataset.from_file(dataset) - loss_matrix = build_loss_matrix(dataset, 2024) + return loss - sim = Microsimulation(dataset=dataset) - weights = sim.calculate("household_weight", 2024).values - estimate_matrix, targets = loss_matrix - is_national = estimate_matrix.columns.str.startswith("nation/") - nation_normalisation_factor = is_national * (1 / is_national.sum()) - state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) - normalisation_factor = np.where( - is_national, nation_normalisation_factor, state_normalisation_factor - ) - weights @ estimate_matrix - - def get_loss_from_mask( - inclusion_mask, estimate_matrix, targets, normalisation_factor - ): - """ - Calculate the loss based on the inclusion mask and the estimate matrix. - """ - masked_weights = weights.copy() - original_weight_total = masked_weights.sum() - if (~inclusion_mask).sum() > 0: - masked_weights[~inclusion_mask] = 0 - masked_weight_total = masked_weights.sum() - masked_weights[inclusion_mask] *= ( - original_weight_total / masked_weight_total - ) - estimates = masked_weights @ estimate_matrix - rel_error = ((estimates - targets) + 1) / (targets + 1) - loss = ((rel_error * normalisation_factor) ** 2).mean() +def candidate_loss_contribution( + weights: np.ndarray, + estimate_matrix: np.ndarray, + targets: np.ndarray, + normalisation_factor: np.ndarray, + loss_rel_change_max: float, + count_iterations: int = 5, + view_fraction_per_iteration: float = 0.3, + fraction_remove_per_iteration: float = 0.1, +) -> np.ndarray: + """ + Minimization approach based on candidate loss contribution. 
+ + This function iteratively removes households that contribute least to the loss, + maintaining the calibration quality within the specified tolerance. - return loss + Parameters + ---------- + weights : (n,) household weights + estimate_matrix : (n, m) matrix mapping weights to estimates + targets : (m,) calibration targets + normalisation_factor : (m,) normalisation factors for different targets + loss_rel_change_max : maximum allowed relative change in loss + count_iterations : number of iterations to perform + view_fraction_per_iteration : fraction of households to evaluate each iteration + fraction_remove_per_iteration : fraction of households to remove each iteration - COUNT_ITERATIONS = 5 - VIEW_FRACTION_PER_ITERATION = 0.3 - FRACTION_REMOVE_PER_ITERATION = 0.1 + Returns + ------- + inclusion_mask : (n,) boolean mask of households to keep + """ from tqdm import tqdm full_mask = np.ones_like(weights, dtype=bool) - for i in range(COUNT_ITERATIONS): + + for i in range(count_iterations): inclusion_mask = full_mask.copy() baseline_loss = get_loss_from_mask( - inclusion_mask, estimate_matrix, targets, normalisation_factor + weights, + inclusion_mask, + estimate_matrix, + targets, + normalisation_factor, ) - household_loss_rel_changes = [] - indices = np.random.choice( - np.arange(len(weights)), - size=int(len(weights) * VIEW_FRACTION_PER_ITERATION), - replace=False, - ) - - # more efficient approach to compute losses for candidate households to be removed - # 1. sample only households that are currently *included* + # Sample only households that are currently included indices = np.random.choice( np.where(full_mask)[0], - size=int(full_mask.sum() * VIEW_FRACTION_PER_ITERATION), + size=int(full_mask.sum() * view_fraction_per_iteration), replace=False, ) - # 2. compute losses for the batch in one shot + + # Compute losses for the batch in one shot candidate_losses = losses_for_candidates( weights, indices, estimate_matrix, targets, normalisation_factor ) - # 3. convert to relative change vs. baseline + + # Convert to relative change vs. baseline household_loss_rel_changes = ( candidate_losses - baseline_loss ) / baseline_loss - inclusion_mask = full_mask.copy() - household_loss_rel_changes = np.array(household_loss_rel_changes) # Sort by the relative change in loss sorted_indices = np.argsort(household_loss_rel_changes) + # Remove the worst households - num_to_remove = int(len(weights) * FRACTION_REMOVE_PER_ITERATION) - worst_indices = sorted_indices[:num_to_remove] + num_to_remove = int(len(weights) * fraction_remove_per_iteration) + worst_indices = indices[sorted_indices[:num_to_remove]] inclusion_mask[worst_indices] = False + # Calculate the new loss new_loss = get_loss_from_mask( - inclusion_mask, estimate_matrix, targets, normalisation_factor + weights, + inclusion_mask, + estimate_matrix, + targets, + normalisation_factor, ) rel_change = (new_loss - baseline_loss) / baseline_loss + if rel_change > loss_rel_change_max: print( - f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}, which is too high ({rel_change:.2%}). Stopping." + f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}, " + f"which is too high ({rel_change:.2%}). Stopping." ) break + print( f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}" ) print( f"Removed {num_to_remove} households with worst relative loss changes." 
) + # Update the full mask full_mask &= inclusion_mask - household_ids = sim.calculate("household_id", 2024).values - remaining_households = household_ids[full_mask] + return full_mask + + +def random_sampling_minimization( + weights, + estimate_matrix, + targets, + normalisation_factor, + target_fractions=[0.1, 0.2, 0.3, 0.4, 0.5], +): + """A simple random sampling approach""" + n = len(weights) + + final_mask = None + lowest_loss = float("inf") + for fraction in target_fractions: + target_size = int(n * fraction) + # Random sampling with multiple attempts + best_mask = None + best_loss = float("inf") + + for _ in range(5): # Try 5 random samples + mask = np.zeros(n, dtype=bool) + mask[np.random.choice(n, target_size, replace=False)] = True + + loss = get_loss_from_mask( + weights, mask, estimate_matrix, targets, normalisation_factor + ) + + if loss < best_loss: + best_loss = loss + best_mask = mask + + if lowest_loss > best_loss: + lowest_loss = best_loss + final_mask = best_mask + + return final_mask + + +def minimise_dataset( + dataset, + output_path: str, + loss_rel_change_max: float, + minimization_function: Callable = candidate_loss_contribution, + **kwargs, +) -> None: + """ + Main function to minimize a dataset using a specified minimization approach. + + Parameters + ---------- + dataset : path to the dataset file or Dataset object + output_path : path where the minimized dataset will be saved + loss_rel_change_max : maximum allowed relative change in loss + minimization_function : function that implements the minimization logic + **kwargs : additional arguments to pass to the minimization function + """ + dataset = str(dataset) + create_calibration_log_file(dataset) + + dataset = Dataset.from_file(dataset) + loss_matrix = build_loss_matrix(dataset, 2024) - # At this point we have a mask of households to keep + sim = Microsimulation(dataset=dataset) - # I'm saving to a csv for ease of debugging, but we need to save to a .h5 file + weights = sim.calculate("household_weight", 2024).values + estimate_matrix, targets = loss_matrix + is_national = estimate_matrix.columns.str.startswith("nation/") + nation_normalisation_factor = is_national * (1 / is_national.sum()) + state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) + normalisation_factor = np.where( + is_national, nation_normalisation_factor, state_normalisation_factor + ) + + # Call the minimization function + inclusion_mask = minimization_function( + weights=weights, + estimate_matrix=estimate_matrix, + targets=targets, + normalisation_factor=normalisation_factor, + loss_rel_change_max=loss_rel_change_max, + **kwargs, + ) + + # Extract household IDs for remaining households + household_ids = sim.calculate("household_id", 2024).values + remaining_households = household_ids[inclusion_mask] + # Create a smaller dataset with only the remaining households df = sim.to_input_dataframe() smaller_df = df[df["household_id__2024"].isin(remaining_households)] @@ -208,27 +306,30 @@ def get_loss_from_mask( ) print(f"Weight relative change: {weight_rel_change:.2%}") + # Create new simulation with smaller dataset sim = Microsimulation(dataset=smaller_df) + # Rescale weights to maintain total sim.set_input( "household_weight", 2024, sim.calculate("household_weight", 2024).values / weight_rel_change, ) + # Prepare data for saving data = {} - for variable in sim.input_variables: data[variable] = {2024: sim.calculate(variable, 2024).values} if data[variable][2024].dtype == "object": data[variable][2024] = 
data[variable][2024].astype("S") + # Save to HDF5 file with h5py.File(output_path, "w") as f: for variable, values in data.items(): for year, value in values.items(): f.create_dataset(f"{variable}/{year}", data=value) - print(f"Saved minimised dataset to {output_path}") + print(f"Saved minimised dataset to {output_path}") create_calibration_log_file(output_path) @@ -240,4 +341,9 @@ def get_loss_from_mask( for file in files: output_path = file.with_name(file.stem + "_minimised.h5") - minimise_dataset(file, output_path, loss_rel_change_max=10) + minimise_dataset( + file, + output_path, + loss_rel_change_max=10, + minimization_function=candidate_loss_contribution, + ) From cd0776c0eb7d1745e987ace34ecc4b56306eee2b Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Fri, 11 Jul 2025 19:19:58 +0200 Subject: [PATCH 07/56] add notebook with testing functionality (havent tested locally) --- .../datasets/cps/enhanced_cps.py | 8 +- policyengine_us_data/utils/minimise.py | 2 +- test_minimization_approach.ipynb | 107 ++++++++++++++++++ 3 files changed, 115 insertions(+), 2 deletions(-) create mode 100644 test_minimization_approach.ipynb diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index bf303f7a..08798622 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -28,6 +28,7 @@ def reweight( targets_array, dropout_rate=0.05, log_path="calibration_log.csv", + penalty_approach="l0_sigmoid", ): target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") @@ -46,7 +47,7 @@ def reweight( ) # TO DO: replace this with a call to the python reweight.py package. - def loss(weights, penalty_approach="l0_sigmoid"): + def loss(weights, penalty_approach=penalty_approach): # Check for Nans in either the weights or the loss matrix if torch.isnan(weights).any(): raise ValueError("Weights contain NaNs") @@ -85,6 +86,11 @@ def loss(weights, penalty_approach="l0_sigmoid"): smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() # L1 penalty + l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs + + if penalty_approach == "l1": + l1 = torch.mean(weights) + return rel_error_normalized.mean() + l1_penalty_weight * l1 return rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 45212905..a9ba3959 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -330,7 +330,7 @@ def minimise_dataset( f.create_dataset(f"{variable}/{year}", data=value) print(f"Saved minimised dataset to {output_path}") - create_calibration_log_file(output_path) + create_calibration_log_file(output_path) if __name__ == "__main__": diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb new file mode 100644 index 00000000..519d2725 --- /dev/null +++ b/test_minimization_approach.ipynb @@ -0,0 +1,107 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 5, + "id": "d6dc9cca", + "metadata": {}, + "outputs": [], + "source": [ + "from policyengine_us_data.utils.minimise import minimise_dataset, random_sampling_minimization, candidate_loss_contribution\n", + "from policyengine_us_data.storage import STORAGE_FOLDER\n", + "from policyengine_us import Microsimulation\n", + "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", + "from policyengine_us_data.utils import 
build_loss_matrix\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db975ac1", + "metadata": {}, + "outputs": [], + "source": [ + "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", + "\n", + "files = [\n", + " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", + " ]\n", + "\n", + "approach = \"random_sampling_minimization\" # for which you can specify the fraction\n", + "minimization_function = random_sampling_minimization\n", + "# other minimization function approach is \"candidate_loss_contribution\"\n", + "\n", + "for file in files:\n", + " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + " minimise_dataset(\n", + " file,\n", + " output_path,\n", + " loss_rel_change_max=10,\n", + " minimization_function=minimization_function, \n", + " target_fractions=[0.5] # remove if switching approach\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35892c9d", + "metadata": {}, + "outputs": [], + "source": [ + "## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation\n", + "\n", + "input_dataset = ExtendedCPS_2024\n", + "\n", + "approach = \"l0_sigmoid\"\n", + "# other options are \"l0_log\", \"l0_exp\", \"l1\"\n", + "\n", + "sim = Microsimulation(dataset=input_dataset)\n", + "data = sim.dataset.load_dataset()\n", + "data[\"household_weight\"] = {}\n", + "original_weights = sim.calculate(\"household_weight\")\n", + "original_weights = original_weights.values + np.random.normal(\n", + " 1, 0.1, len(original_weights)\n", + ")\n", + "for year in range(2024, 2025):\n", + " loss_matrix, targets_array = build_loss_matrix(\n", + " input_dataset, year\n", + " )\n", + " optimised_weights = reweight(\n", + " original_weights,\n", + " loss_matrix,\n", + " targets_array,\n", + " log_path= STORAGE_FOLDER / approach / \"calibration_log.csv\",\n", + " penalty_approach=approach,\n", + " )\n", + " data[\"household_weight\"][year] = optimised_weights\n", + "\n", + "output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + "\n", + "data.save_dataset(output_path)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pe", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 2c050fc973ba312d070c27dcb7f1fb049e1e2af2 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Fri, 11 Jul 2025 19:20:55 +0200 Subject: [PATCH 08/56] lint --- policyengine_us_data/utils/minimise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index a9ba3959..45212905 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -330,7 +330,7 @@ def minimise_dataset( f.create_dataset(f"{variable}/{year}", data=value) print(f"Saved minimised dataset to {output_path}") - create_calibration_log_file(output_path) + create_calibration_log_file(output_path) if __name__ == "__main__": From ee98fc36ab920d571982862dc48d950b7a58ec3d Mon Sep 17 00:00:00 2001 From: eccuraa Date: Fri, 11 Jul 2025 20:06:32 -0400 Subject: [PATCH 09/56] debugged 2nd cell: created path & removed optional parameters. 
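This patch drops loss_rel_change_max from the required signature of minimise_dataset and forwards strategy-specific options through **kwargs, so each minimization function receives only the arguments it needs. A minimal usage sketch of the kwargs-based entry point (illustrative output paths that mirror the test notebook; not part of the diff below, and the directory layout is an assumption):

    from policyengine_us_data.storage import STORAGE_FOLDER
    from policyengine_us_data.utils.minimise import (
        minimise_dataset,
        candidate_loss_contribution,
        random_sampling_minimization,
    )

    ecps = STORAGE_FOLDER / "enhanced_cps_2024.h5"

    # Greedy candidate-removal strategy: loss_rel_change_max is forwarded
    # through **kwargs to candidate_loss_contribution.
    out = STORAGE_FOLDER / "candidate_loss_contribution" / "enhanced_cps_2024_minimised.h5"
    out.parent.mkdir(parents=True, exist_ok=True)
    minimise_dataset(
        ecps,
        out,
        minimization_function=candidate_loss_contribution,
        loss_rel_change_max=10,
    )

    # Random-sampling strategy: target_fractions is forwarded instead.
    out = STORAGE_FOLDER / "random_sampling_minimization" / "enhanced_cps_2024_minimised.h5"
    out.parent.mkdir(parents=True, exist_ok=True)
    minimise_dataset(
        ecps,
        out,
        minimization_function=random_sampling_minimization,
        target_fractions=[0.5],
    )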
--- policyengine_us_data/utils/minimise.py | 8 +- test_minimization_approach.ipynb | 219 +++++++++++++++++++++++-- 2 files changed, 210 insertions(+), 17 deletions(-) diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 45212905..e84e1bee 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -250,10 +250,11 @@ def random_sampling_minimization( def minimise_dataset( dataset, output_path: str, - loss_rel_change_max: float, minimization_function: Callable = candidate_loss_contribution, **kwargs, ) -> None: + #loss_rel_change_max = kwargs.pop('loss_rel_change_max', 10.0) + """ Main function to minimize a dataset using a specified minimization approach. @@ -288,8 +289,7 @@ def minimise_dataset( estimate_matrix=estimate_matrix, targets=targets, normalisation_factor=normalisation_factor, - loss_rel_change_max=loss_rel_change_max, - **kwargs, + **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. ) # Extract household IDs for remaining households @@ -344,6 +344,4 @@ def minimise_dataset( minimise_dataset( file, output_path, - loss_rel_change_max=10, - minimization_function=candidate_loss_contribution, ) diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 519d2725..8400d4fe 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "d6dc9cca", "metadata": {}, "outputs": [], @@ -12,15 +12,188 @@ "from policyengine_us import Microsimulation\n", "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", "from policyengine_us_data.utils import build_loss_matrix\n", - "import numpy as np" + "import numpy as np\n", + "import os\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "db975ac1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + "Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid enrollment for IN with target 1623361k\n", + "Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with target 1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + "Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid 
enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n", + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + "Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid enrollment for IN with target 1623361k\n", + "Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with target 1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + "Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid 
enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n", + "Iteration 1: Loss changed from 4.565926440883813e-08 to 5.336074793134824e-08\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 2: Loss changed from 5.336074793134824e-08 to 6.613975524012925e-08\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 3: Loss changed from 6.613975524012925e-08 to 8.772356346466888e-08\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 4: Loss changed from 8.772356346466888e-08 to 1.1406333044735764e-07\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 5: Loss changed from 1.1406333044735764e-07 to 1.3666972105275468e-07\n", + "Removed 4131 households with worst relative loss changes.\n", + "Weight relative change: 52.19%\n", + "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + "Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid 
enrollment for IN with target 1623361k\n", + "Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with target 1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + "Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n" + ] + } + ], "source": [ "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", "\n", @@ -28,27 +201,49 @@ " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", " ]\n", "\n", - "approach = \"random_sampling_minimization\" # for which you can specify the fraction\n", - "minimization_function = random_sampling_minimization\n", - "# other minimization function approach is \"candidate_loss_contribution\"\n", + "approach = \"candidate_loss_contribution\" # for which you can specify the fraction\n", + "minimization_function = candidate_loss_contribution\n", + "# other minimization function approach is \"candidate_loss_contribution\", for which you can specify the tolerance for loss relative change.\n", "\n", "for file in files:\n", " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + "\n", " minimise_dataset(\n", " file,\n", " output_path,\n", " loss_rel_change_max=10,\n", " minimization_function=minimization_function, \n", - " target_fractions=[0.5] # remove if switching approach\n", + " #target_fractions=[0.5] # remove if switching approach\n", " )" ] }, { "cell_type": "code", - "execution_count": null, 
+ "execution_count": 5, "id": "35892c9d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "ValueError", + "evalue": "Invalid response code 404 for url https://api.github.com/repos/policyengine/irs-soi-puf/releases/tags/1.8.0.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[5], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m approach \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124ml0_sigmoid\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# other options are \"l0_log\", \"l0_exp\", \"l1\"\u001b[39;00m\n\u001b[0;32m----> 8\u001b[0m sim \u001b[38;5;241m=\u001b[39m \u001b[43mMicrosimulation\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minput_dataset\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 9\u001b[0m data \u001b[38;5;241m=\u001b[39m sim\u001b[38;5;241m.\u001b[39mdataset\u001b[38;5;241m.\u001b[39mload_dataset()\n\u001b[1;32m 10\u001b[0m data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhousehold_weight\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m {}\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/system.py:221\u001b[0m, in \u001b[0;36mMicrosimulation.__init__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 214\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 215\u001b[0m dataset \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, \u001b[38;5;28mstr\u001b[39m)\n\u001b[1;32m 217\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcps_2023\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m dataset\n\u001b[1;32m 218\u001b[0m ):\n\u001b[1;32m 219\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m2023\u001b[39m\n\u001b[0;32m--> 221\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 223\u001b[0m reform \u001b[38;5;241m=\u001b[39m create_structural_reforms_from_parameters(\n\u001b[1;32m 224\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtax_benefit_system\u001b[38;5;241m.\u001b[39mparameters, start_instant\n\u001b[1;32m 225\u001b[0m )\n\u001b[1;32m 226\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m reform \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:184\u001b[0m, in \u001b[0;36mSimulation.__init__\u001b[0;34m(self, tax_benefit_system, populations, situation, dataset, reform, trace, default_input_period, default_calculation_period)\u001b[0m\n\u001b[1;32m 180\u001b[0m dataset \u001b[38;5;241m=\u001b[39m 
Dataset\u001b[38;5;241m.\u001b[39mfrom_file(\n\u001b[1;32m 181\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 182\u001b[0m )\n\u001b[1;32m 183\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, \u001b[38;5;28mtype\u001b[39m):\n\u001b[0;32m--> 184\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset: Dataset \u001b[38;5;241m=\u001b[39m \u001b[43mdataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequire\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 185\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, pd\u001b[38;5;241m.\u001b[39mDataFrame):\n\u001b[1;32m 186\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_dataframe(\n\u001b[1;32m 187\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 188\u001b[0m )\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/data/dataset.py:103\u001b[0m, in \u001b[0;36mDataset.__init__\u001b[0;34m(self, require)\u001b[0m\n\u001b[1;32m 101\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdownload()\n\u001b[1;32m 102\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 103\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/datasets/cps/extended_cps.py:147\u001b[0m, in \u001b[0;36mExtendedCPS.generate\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 144\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpolicyengine_us\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Microsimulation\n\u001b[1;32m 146\u001b[0m cps_sim \u001b[38;5;241m=\u001b[39m Microsimulation(dataset\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcps)\n\u001b[0;32m--> 147\u001b[0m puf_sim \u001b[38;5;241m=\u001b[39m \u001b[43mMicrosimulation\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpuf\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 149\u001b[0m puf_sim\u001b[38;5;241m.\u001b[39msubsample(\u001b[38;5;241m10_000\u001b[39m)\n\u001b[1;32m 151\u001b[0m INPUTS \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 152\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mage\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 153\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mis_male\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mis_tax_unit_dependent\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 159\u001b[0m ]\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/system.py:221\u001b[0m, in \u001b[0;36mMicrosimulation.__init__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 214\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 215\u001b[0m dataset \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 
216\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, \u001b[38;5;28mstr\u001b[39m)\n\u001b[1;32m 217\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcps_2023\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m dataset\n\u001b[1;32m 218\u001b[0m ):\n\u001b[1;32m 219\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m2023\u001b[39m\n\u001b[0;32m--> 221\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 223\u001b[0m reform \u001b[38;5;241m=\u001b[39m create_structural_reforms_from_parameters(\n\u001b[1;32m 224\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtax_benefit_system\u001b[38;5;241m.\u001b[39mparameters, start_instant\n\u001b[1;32m 225\u001b[0m )\n\u001b[1;32m 226\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m reform \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:184\u001b[0m, in \u001b[0;36mSimulation.__init__\u001b[0;34m(self, tax_benefit_system, populations, situation, dataset, reform, trace, default_input_period, default_calculation_period)\u001b[0m\n\u001b[1;32m 180\u001b[0m dataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_file(\n\u001b[1;32m 181\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 182\u001b[0m )\n\u001b[1;32m 183\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, \u001b[38;5;28mtype\u001b[39m):\n\u001b[0;32m--> 184\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset: Dataset \u001b[38;5;241m=\u001b[39m \u001b[43mdataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequire\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 185\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, pd\u001b[38;5;241m.\u001b[39mDataFrame):\n\u001b[1;32m 186\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_dataframe(\n\u001b[1;32m 187\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 188\u001b[0m )\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/data/dataset.py:101\u001b[0m, in \u001b[0;36mDataset.__init__\u001b[0;34m(self, require)\u001b[0m\n\u001b[1;32m 99\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexists \u001b[38;5;129;01mand\u001b[39;00m require:\n\u001b[1;32m 100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39murl \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 
101\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdownload\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 102\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgenerate()\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/data/dataset.py:343\u001b[0m, in \u001b[0;36mDataset.download\u001b[0;34m(self, url, version)\u001b[0m\n\u001b[1;32m 341\u001b[0m response \u001b[38;5;241m=\u001b[39m requests\u001b[38;5;241m.\u001b[39mget(url, headers\u001b[38;5;241m=\u001b[39mauth_headers)\n\u001b[1;32m 342\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m response\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m200\u001b[39m:\n\u001b[0;32m--> 343\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 344\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid response code \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresponse\u001b[38;5;241m.\u001b[39mstatus_code\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m for url \u001b[39m\u001b[38;5;132;01m{\u001b[39;00murl\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 345\u001b[0m )\n\u001b[1;32m 346\u001b[0m assets \u001b[38;5;241m=\u001b[39m response\u001b[38;5;241m.\u001b[39mjson()[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124massets\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 347\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m asset \u001b[38;5;129;01min\u001b[39;00m assets:\n", + "\u001b[0;31mValueError\u001b[0m: Invalid response code 404 for url https://api.github.com/repos/policyengine/irs-soi-puf/releases/tags/1.8.0." + ] + } + ], "source": [ "## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation\n", "\n", @@ -85,7 +280,7 @@ ], "metadata": { "kernelspec": { - "display_name": "pe", + "display_name": "policyengine-us-data", "language": "python", "name": "python3" }, @@ -99,7 +294,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.11" + "version": "3.11.13" } }, "nbformat": 4, From f6d7f0fa00f158f099c2dc15116fac4987d33085 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Mon, 14 Jul 2025 15:22:58 +0200 Subject: [PATCH 10/56] few updates to the testing framework --- changelog_entry.yaml | 2 +- .../datasets/cps/enhanced_cps.py | 78 +++++++++++++------ policyengine_us_data/utils/minimise.py | 75 +++++++++++++----- pyproject.toml | 4 +- test_minimization_approach.ipynb | 75 +++++++++--------- 5 files changed, 149 insertions(+), 85 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 84eeb584..ac664753 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,4 +1,4 @@ - bump: minor changes: added: - - Enhanced CPS minimizing tests. \ No newline at end of file + - Enhanced CPS minimizing tests. 
\ No newline at end of file diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 08798622..6ad510f3 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -22,13 +22,25 @@ torch = None +bad_targets = [ + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", +] + + def reweight( original_weights, loss_matrix, targets_array, dropout_rate=0.05, log_path="calibration_log.csv", - penalty_approach="l0_sigmoid", + penalty_approach=None, ): target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") @@ -64,35 +76,43 @@ def loss(weights, penalty_approach=penalty_approach): if torch.isnan(rel_error_normalized).any(): raise ValueError("Relative error contains NaNs") - # L0 penalty (approximated with smooth function) - # Since L0 is non-differentiable, we use a smooth approximation - # Common approaches: + if penalty_approach is not None: + # L0 penalty (approximated with smooth function) + # Since L0 is non-differentiable, we use a smooth approximation + # Common approaches: + + epsilon = 1e-3 # Threshold for "near zero" + l0_penalty_weight = 1e-1 # Adjust this hyperparameter - epsilon = 1e-3 # Threshold for "near zero" - l0_penalty_weight = 1e-1 # Adjust this hyperparameter + # Option 1: Sigmoid approximation + if penalty_approach == "l0_sigmoid": + smoothed_l0 = torch.sigmoid( + (weights - epsilon) / (epsilon * 0.1) + ).mean() - # Option 1: Sigmoid approximation - if penalty_approach == "l0_sigmoid": - smoothed_l0 = torch.sigmoid( - (weights - epsilon) / (epsilon * 0.1) - ).mean() + # Option 2: Log-sum penalty (smoother) + if penalty_approach == "l0_log": + smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len( + weights + ) - # Option 2: Log-sum penalty (smoother) - if penalty_approach == "l0_log": - smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len(weights) + # Option 3: Exponential penalty + if penalty_approach == "l0_exp": + smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() - # Option 3: Exponential penalty - if penalty_approach == "l0_exp": - smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() + # L1 penalty + l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs - # L1 penalty - l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs + if penalty_approach == "l1": + l1 = torch.mean(weights) + return rel_error_normalized.mean() + l1_penalty_weight * l1 - if penalty_approach == "l1": - l1 = torch.mean(weights) - return rel_error_normalized.mean() + l1_penalty_weight * l1 + return ( + rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 + ) - return rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 + else: + return rel_error_normalized.mean() def dropout_weights(weights, p): if p == 0: @@ -213,10 +233,18 @@ def generate(self): 
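The comments in the hunk above describe three smooth surrogates for the non-differentiable L0 penalty, plus an L1 option. As a rough, self-contained sketch of how those terms behave on a toy weight vector (illustrative values only; nothing is assumed beyond the torch import the module already uses):

    import torch

    weights = torch.tensor([0.0, 0.0005, 0.01, 2.0, 150.0])  # toy household weights
    epsilon = 1e-3  # threshold below which a weight counts as "near zero"

    # Sigmoid surrogate: ~0 for weights well below epsilon, ~1 well above it
    l0_sigmoid = torch.sigmoid((weights - epsilon) / (epsilon * 0.1)).mean()

    # Log-sum surrogate: grows slowly with weight magnitude
    l0_log = torch.log(1 + weights / epsilon).sum() / len(weights)

    # Exponential surrogate: saturates at 1 for weights much larger than epsilon
    l0_exp = (1 - torch.exp(-weights / epsilon)).mean()

    # L1 term: mean weight
    l1 = torch.mean(weights)

    print(l0_sigmoid.item(), l0_log.item(), l0_exp.item(), l1.item())

Each surrogate shrinks as more weights sit at or near zero, which is what lets the optimiser trade a small amount of target fit for a sparser, smaller dataset without a hard, non-differentiable count.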
loss_matrix, targets_array = build_loss_matrix( self.input_dataset, year ) + + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_array_clean = targets_array[keep_idx] + assert loss_matrix_clean.shape[1] == targets_array_clean.size + optimised_weights = reweight( original_weights, - loss_matrix, - targets_array, + loss_matrix_clean, + targets_array_clean, log_path="calibration_log.csv", ) data["household_weight"][year] = optimised_weights diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index e84e1bee..df193c6e 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -7,30 +7,53 @@ from policyengine_us_data.storage import STORAGE_FOLDER from typing import Optional, Callable - -def create_calibration_log_file(file_path): +bad_targets = [ + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", +] + + +def create_calibration_log_file(file_path, epoch=0): dataset = Dataset.from_file(file_path) - loss_matrix = build_loss_matrix(dataset, 2024) + loss_matrix, targets = build_loss_matrix(dataset, 2024) + + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_clean = targets[keep_idx] + assert loss_matrix_clean.shape[1] == targets_clean.size sim = Microsimulation(dataset=dataset) - estimates = sim.calculate("household_weight", 2024).values @ loss_matrix[0] - target_names = loss_matrix[0].columns - target_values = loss_matrix[1] + estimates = ( + sim.calculate("household_weight", 2024).values @ loss_matrix_clean + ) + target_names = loss_matrix_clean.columns df = pd.DataFrame( { "target_name": target_names, "estimate": estimates, - "target": target_values, + "target": targets_clean, } ) - df["epoch"] = 0 + df["epoch"] = epoch df["error"] = df["estimate"] - df["target"] df["rel_error"] = df["error"] / df["target"] df["abs_error"] = df["error"].abs() - df["rel_abs_error"] = df["abs_error"] / df["target"].abs() + df["rel_abs_error"] = ( + df["abs_error"] / df["target"].abs() + if df["target"].abs().sum() > 0 + else np.nan + ) df["loss"] = (df["rel_error"] ** 2).mean() df.to_csv( @@ -215,11 +238,14 @@ def random_sampling_minimization( estimate_matrix, targets, normalisation_factor, + random=True, target_fractions=[0.1, 0.2, 0.3, 0.4, 0.5], ): """A simple random sampling approach""" n = len(weights) + household_weights_normalized = weights / weights.sum() + final_mask = None lowest_loss = float("inf") for fraction in target_fractions: @@ -230,7 +256,14 @@ def random_sampling_minimization( for _ in range(5): # Try 5 random samples mask = np.zeros(n, dtype=bool) - mask[np.random.choice(n, target_size, replace=False)] = True + mask[ 
+ np.random.choice( + n, + target_size, + p=household_weights_normalized if random else None, + replace=False, + ) + ] = True loss = get_loss_from_mask( weights, mask, estimate_matrix, targets, normalisation_factor @@ -253,8 +286,6 @@ def minimise_dataset( minimization_function: Callable = candidate_loss_contribution, **kwargs, ) -> None: - #loss_rel_change_max = kwargs.pop('loss_rel_change_max', 10.0) - """ Main function to minimize a dataset using a specified minimization approach. @@ -270,13 +301,19 @@ def minimise_dataset( create_calibration_log_file(dataset) dataset = Dataset.from_file(dataset) - loss_matrix = build_loss_matrix(dataset, 2024) + loss_matrix, targets = build_loss_matrix(dataset, 2024) + + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_clean = targets[keep_idx] + assert loss_matrix_clean.shape[1] == targets_clean.size sim = Microsimulation(dataset=dataset) weights = sim.calculate("household_weight", 2024).values - estimate_matrix, targets = loss_matrix - is_national = estimate_matrix.columns.str.startswith("nation/") + is_national = loss_matrix_clean.columns.str.startswith("nation/") nation_normalisation_factor = is_national * (1 / is_national.sum()) state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) normalisation_factor = np.where( @@ -286,10 +323,10 @@ def minimise_dataset( # Call the minimization function inclusion_mask = minimization_function( weights=weights, - estimate_matrix=estimate_matrix, - targets=targets, + estimate_matrix=loss_matrix_clean, + targets=targets_clean, normalisation_factor=normalisation_factor, - **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. + **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. 
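The random_sampling_minimization hunk above draws households with probability proportional to their calibration weight when random=True, then scores each candidate mask against the targets and keeps the best one. A self-contained sketch of just the weighted selection step, with made-up sizes and weights (only numpy assumed):

    import numpy as np

    rng = np.random.default_rng(0)
    weights = rng.uniform(0.5, 3.0, size=1_000)  # stand-in household weights
    p = weights / weights.sum()                  # selection probabilities
    target_size = int(0.3 * len(weights))        # e.g. keep 30% of households

    mask = np.zeros(len(weights), dtype=bool)
    mask[rng.choice(len(weights), target_size, replace=False, p=p)] = True
    print(f"{mask.sum()} of {len(weights)} households retained")

The idea behind sampling proportionally to weight is that households carrying more of the weighted totals are more likely to be kept, so a random subset should disturb the calibration loss less than a uniform draw would.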
) # Extract household IDs for remaining households @@ -330,7 +367,7 @@ def minimise_dataset( f.create_dataset(f"{variable}/{year}", data=value) print(f"Saved minimised dataset to {output_path}") - create_calibration_log_file(output_path) + create_calibration_log_file(output_path, epoch=500) if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml index 0352db69..65d1ca8e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,8 +17,8 @@ authors = [ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ - "policyengine-us>=1.197.0", - "policyengine-core>=3.14.1", + "policyengine-us>=1.340.0", + "policyengine-core>=3.17.1", "requests", "tqdm", "microdf_python>=0.4.3", diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 8400d4fe..54f3c6fa 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 6, + "execution_count": 1, "id": "d6dc9cca", "metadata": {}, "outputs": [], @@ -13,12 +13,24 @@ "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", "from policyengine_us_data.utils import build_loss_matrix\n", "import numpy as np\n", - "import os\n" + "import os" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, + "id": "6daabe7c", + "metadata": {}, + "outputs": [], + "source": [ + "# Original ECPS 2024 dataset size (for household entity): 41310\n", + "# After minimization through \"candidate_loss_contribution\" and a 1.0 max error change (for household entity): 20655 \n", + "# After minimization through \"candidate_loss_contribution\" and a 0.001 max error change (for household entity): 24786\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "id": "db975ac1", "metadata": {}, "outputs": [ @@ -128,18 +140,17 @@ "Targeting Medicaid enrollment for WI with target 1108320k\n", "Targeting Medicaid enrollment for WV with target 467632k\n", "Targeting Medicaid enrollment for WY with target 57320k\n", - "Iteration 1: Loss changed from 4.565926440883813e-08 to 5.336074793134824e-08\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 2: Loss changed from 5.336074793134824e-08 to 6.613975524012925e-08\n", + "Iteration 1: Loss changed from 3.2762747622384236e-07 to 3.269240481699973e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Iteration 3: Loss changed from 6.613975524012925e-08 to 8.772356346466888e-08\n", + "Iteration 2: Loss changed from 3.269240481699973e-07 to 3.260236288186747e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Iteration 4: Loss changed from 8.772356346466888e-08 to 1.1406333044735764e-07\n", + "Iteration 3: Loss changed from 3.260236288186747e-07 to 3.258031076364204e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Iteration 5: Loss changed from 1.1406333044735764e-07 to 1.3666972105275468e-07\n", + "Iteration 4: Loss changed from 3.258031076364204e-07 to 3.2572688458943833e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Weight relative change: 52.19%\n", - "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", + "Iteration 5: Loss changed from 3.2572688458943833e-07 to 3.2598496400284314e-07, which is too high (0.08%). 
Stopping.\n", + "Weight relative change: 99.10%\n", + "Saved minimised dataset to /Users/movil1/Desktop/PYTHONJOBS/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", "Targeting Medicaid enrollment for AK with target 231577k\n", "Targeting Medicaid enrollment for AL with target 766009k\n", "Targeting Medicaid enrollment for AR with target 733561k\n", @@ -203,7 +214,7 @@ "\n", "approach = \"candidate_loss_contribution\" # for which you can specify the fraction\n", "minimization_function = candidate_loss_contribution\n", - "# other minimization function approach is \"candidate_loss_contribution\", for which you can specify the tolerance for loss relative change.\n", + "# other minimization function approach is \"random_sampling_minimization\", for which you can specify the tolerance for loss relative change.\n", "\n", "for file in files:\n", " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", @@ -212,38 +223,18 @@ " minimise_dataset(\n", " file,\n", " output_path,\n", - " loss_rel_change_max=10,\n", " minimization_function=minimization_function, \n", - " #target_fractions=[0.5] # remove if switching approach\n", + " # target_fractions=[0.5] # remove if switching approach\n", + " loss_rel_change_max=0.0001, # remove if switching approach\n", " )" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "35892c9d", "metadata": {}, - "outputs": [ - { - "ename": "ValueError", - "evalue": "Invalid response code 404 for url https://api.github.com/repos/policyengine/irs-soi-puf/releases/tags/1.8.0.", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[5], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m approach \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124ml0_sigmoid\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# other options are \"l0_log\", \"l0_exp\", \"l1\"\u001b[39;00m\n\u001b[0;32m----> 8\u001b[0m sim \u001b[38;5;241m=\u001b[39m \u001b[43mMicrosimulation\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minput_dataset\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 9\u001b[0m data \u001b[38;5;241m=\u001b[39m sim\u001b[38;5;241m.\u001b[39mdataset\u001b[38;5;241m.\u001b[39mload_dataset()\n\u001b[1;32m 10\u001b[0m data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhousehold_weight\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m {}\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/system.py:221\u001b[0m, in \u001b[0;36mMicrosimulation.__init__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 214\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 215\u001b[0m dataset \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, \u001b[38;5;28mstr\u001b[39m)\n\u001b[1;32m 217\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcps_2023\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m dataset\n\u001b[1;32m 218\u001b[0m ):\n\u001b[1;32m 219\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m2023\u001b[39m\n\u001b[0;32m--> 221\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 223\u001b[0m reform \u001b[38;5;241m=\u001b[39m create_structural_reforms_from_parameters(\n\u001b[1;32m 224\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtax_benefit_system\u001b[38;5;241m.\u001b[39mparameters, start_instant\n\u001b[1;32m 225\u001b[0m )\n\u001b[1;32m 226\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m reform \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:184\u001b[0m, in \u001b[0;36mSimulation.__init__\u001b[0;34m(self, tax_benefit_system, populations, situation, dataset, reform, trace, default_input_period, default_calculation_period)\u001b[0m\n\u001b[1;32m 180\u001b[0m dataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_file(\n\u001b[1;32m 181\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 182\u001b[0m )\n\u001b[1;32m 183\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, \u001b[38;5;28mtype\u001b[39m):\n\u001b[0;32m--> 184\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset: Dataset \u001b[38;5;241m=\u001b[39m \u001b[43mdataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequire\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 185\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, pd\u001b[38;5;241m.\u001b[39mDataFrame):\n\u001b[1;32m 186\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_dataframe(\n\u001b[1;32m 187\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 188\u001b[0m )\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/data/dataset.py:103\u001b[0m, in \u001b[0;36mDataset.__init__\u001b[0;34m(self, require)\u001b[0m\n\u001b[1;32m 101\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdownload()\n\u001b[1;32m 102\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 103\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/datasets/cps/extended_cps.py:147\u001b[0m, in \u001b[0;36mExtendedCPS.generate\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 144\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpolicyengine_us\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Microsimulation\n\u001b[1;32m 146\u001b[0m cps_sim \u001b[38;5;241m=\u001b[39m 
Microsimulation(dataset\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcps)\n\u001b[0;32m--> 147\u001b[0m puf_sim \u001b[38;5;241m=\u001b[39m \u001b[43mMicrosimulation\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpuf\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 149\u001b[0m puf_sim\u001b[38;5;241m.\u001b[39msubsample(\u001b[38;5;241m10_000\u001b[39m)\n\u001b[1;32m 151\u001b[0m INPUTS \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 152\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mage\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 153\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mis_male\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mis_tax_unit_dependent\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 159\u001b[0m ]\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/system.py:221\u001b[0m, in \u001b[0;36mMicrosimulation.__init__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 214\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 215\u001b[0m dataset \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, \u001b[38;5;28mstr\u001b[39m)\n\u001b[1;32m 217\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcps_2023\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m dataset\n\u001b[1;32m 218\u001b[0m ):\n\u001b[1;32m 219\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m2023\u001b[39m\n\u001b[0;32m--> 221\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 223\u001b[0m reform \u001b[38;5;241m=\u001b[39m create_structural_reforms_from_parameters(\n\u001b[1;32m 224\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtax_benefit_system\u001b[38;5;241m.\u001b[39mparameters, start_instant\n\u001b[1;32m 225\u001b[0m )\n\u001b[1;32m 226\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m reform \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:184\u001b[0m, in \u001b[0;36mSimulation.__init__\u001b[0;34m(self, tax_benefit_system, populations, situation, dataset, reform, trace, default_input_period, default_calculation_period)\u001b[0m\n\u001b[1;32m 180\u001b[0m dataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_file(\n\u001b[1;32m 181\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 182\u001b[0m )\n\u001b[1;32m 183\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, 
\u001b[38;5;28mtype\u001b[39m):\n\u001b[0;32m--> 184\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset: Dataset \u001b[38;5;241m=\u001b[39m \u001b[43mdataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequire\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 185\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, pd\u001b[38;5;241m.\u001b[39mDataFrame):\n\u001b[1;32m 186\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_dataframe(\n\u001b[1;32m 187\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 188\u001b[0m )\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/data/dataset.py:101\u001b[0m, in \u001b[0;36mDataset.__init__\u001b[0;34m(self, require)\u001b[0m\n\u001b[1;32m 99\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexists \u001b[38;5;129;01mand\u001b[39;00m require:\n\u001b[1;32m 100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39murl \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 101\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdownload\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 102\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgenerate()\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/data/dataset.py:343\u001b[0m, in \u001b[0;36mDataset.download\u001b[0;34m(self, url, version)\u001b[0m\n\u001b[1;32m 341\u001b[0m response \u001b[38;5;241m=\u001b[39m requests\u001b[38;5;241m.\u001b[39mget(url, headers\u001b[38;5;241m=\u001b[39mauth_headers)\n\u001b[1;32m 342\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m response\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m200\u001b[39m:\n\u001b[0;32m--> 343\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 344\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid response code \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresponse\u001b[38;5;241m.\u001b[39mstatus_code\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m for url \u001b[39m\u001b[38;5;132;01m{\u001b[39;00murl\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 345\u001b[0m )\n\u001b[1;32m 346\u001b[0m assets \u001b[38;5;241m=\u001b[39m response\u001b[38;5;241m.\u001b[39mjson()[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124massets\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 347\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m asset \u001b[38;5;129;01min\u001b[39;00m assets:\n", - "\u001b[0;31mValueError\u001b[0m: Invalid response code 404 for url https://api.github.com/repos/policyengine/irs-soi-puf/releases/tags/1.8.0." 
- ] - } - ], + "outputs": [], "source": [ "## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation\n", "\n", @@ -276,11 +267,19 @@ "\n", "data.save_dataset(output_path)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4cf8e89", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "policyengine-us-data", + "display_name": "pe", "language": "python", "name": "python3" }, @@ -294,7 +293,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.13" + "version": "3.11.11" } }, "nbformat": 4, From a042a01f7826997d0ac99b330183b80cfee167df Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 13:44:20 -0400 Subject: [PATCH 11/56] added CPS_2023 to lite mode generation --- changelog_entry.yaml | 6 +++--- policyengine_us_data/datasets/cps/cps.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index ac664753..dcce3f1a 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,4 +1,4 @@ -- bump: minor +- bump: patch changes: - added: - - Enhanced CPS minimizing tests. \ No newline at end of file + changed: + - lite mode now builds CPS_2023 in addition to CPS_2024 diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 3b976a31..fde981ba 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2006,6 +2006,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if __name__ == "__main__": if test_lite: + CPS_2023().generate() CPS_2024().generate() else: CPS_2021().generate() From cabeb56c7a1fe926eaf4c5aa5ecd26f45df3043f Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 14:54:23 -0400 Subject: [PATCH 12/56] Fixed manual test --- .github/workflows/code_changes.yaml | 1 + .github/workflows/manual_tests.yaml | 2 +- policyengine_us_data/datasets/cps/cps.py | 1 + pyproject.toml | 4 ++-- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/code_changes.yaml b/.github/workflows/code_changes.yaml index 6b474227..edd804db 100644 --- a/.github/workflows/code_changes.yaml +++ b/.github/workflows/code_changes.yaml @@ -2,6 +2,7 @@ name: Code changes on: + workflow_call: push: branches: - main diff --git a/.github/workflows/manual_tests.yaml b/.github/workflows/manual_tests.yaml index a2daca18..fb13ba89 100644 --- a/.github/workflows/manual_tests.yaml +++ b/.github/workflows/manual_tests.yaml @@ -11,7 +11,7 @@ on: jobs: test: - uses: ./.github/workflows/pr_changelog.yaml + uses: ./.github/workflows/code_changes.yaml with: TEST_LITE: ${{ github.event.inputs.test_lite }} secrets: inherit diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index fde981ba..177f4707 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2008,6 +2008,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() + print(2 + 2) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index 65d1ca8e..3490ff1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,8 +17,8 @@ authors = [ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ - "policyengine-us>=1.340.0", - "policyengine-core>=3.17.1", + "policyengine-us>=1.333.0", + "policyengine-core>=3.14.1", "requests", "tqdm", 
"microdf_python>=0.4.3", From 7b76afba9eb55c3d2588c1ba5c6683a48e3709f7 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 15:02:22 -0400 Subject: [PATCH 13/56] try again with locked version --- policyengine_us_data/datasets/cps/cps.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 177f4707..09a594c3 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2008,7 +2008,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 2) + print(2 + 3) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index 3490ff1b..74af05bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ authors = [ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ - "policyengine-us>=1.333.0", + "policyengine-us==1.333.0", "policyengine-core>=3.14.1", "requests", "tqdm", From 4056df4762b5d5e98ff6da815eae8de1484a4c25 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 15:44:32 -0400 Subject: [PATCH 14/56] trying things --- policyengine_us_data/datasets/cps/cps.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 09a594c3..1edce6e9 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -100,9 +100,14 @@ def downsample(self, frac: float): original_dtypes = { key: original_data[key].dtype for key in original_data } - + print("\n\nHERE IS THE PROBLEM-----") + print(f"frac is {frac}") + print(self) + print(Microsimulation) sim = Microsimulation(dataset=self) - sim.subsample(frac=frac) + print(sim) + print(sim.subsample) + #sim.subsample(frac=frac) for key in original_data: if key not in sim.tax_benefit_system.variables: From 96c4c25b71b5e148059be66a28805ad41c8cc28b Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 15:45:47 -0400 Subject: [PATCH 15/56] lint --- policyengine_us_data/datasets/cps/cps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 1edce6e9..30688719 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -107,7 +107,7 @@ def downsample(self, frac: float): sim = Microsimulation(dataset=self) print(sim) print(sim.subsample) - #sim.subsample(frac=frac) + # sim.subsample(frac=frac) for key in original_data: if key not in sim.tax_benefit_system.variables: From e20c75c202531e72fd118107c40fa10a0cda6e79 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 16:05:26 -0400 Subject: [PATCH 16/56] trying 3.11.12 --- policyengine_us_data/datasets/cps/cps.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 30688719..8219e915 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -107,7 +107,7 @@ def downsample(self, frac: float): sim = Microsimulation(dataset=self) print(sim) print(sim.subsample) - # sim.subsample(frac=frac) + sim.subsample(frac=frac) for key in original_data: if key not in sim.tax_benefit_system.variables: @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if 
test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 3) + print(2 + 5) else: CPS_2021().generate() CPS_2022().generate() From 776eda8ce513f7e1b845cb8212abd17301e46c73 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 16:10:26 -0400 Subject: [PATCH 17/56] now actually specifying py version --- policyengine_us_data/datasets/cps/cps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 8219e915..a25aba26 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 5) + print(2 + 7) else: CPS_2021().generate() CPS_2022().generate() From cd771794473e0bb1f5005e7d6c598d8c1bc2a112 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 16:33:21 -0400 Subject: [PATCH 18/56] pandas v --- policyengine_us_data/datasets/cps/cps.py | 2 +- pyproject.toml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index a25aba26..b3554604 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 7) + print(2 + 8) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index 74af05bf..6c767ede 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ requires-python = ">=3.11, <3.13.0" dependencies = [ "policyengine-us==1.333.0", "policyengine-core>=3.14.1", + "pandas==2.3.1", "requests", "tqdm", "microdf_python>=0.4.3", From d0ce44db56b066e4d370bc434fba08435f65e01f Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 16:47:12 -0400 Subject: [PATCH 19/56] small runner --- .github/workflows/pr_code_changes.yaml | 2 +- policyengine_us_data/datasets/cps/cps.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 213d192f..385e5a4c 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -49,7 +49,7 @@ jobs: run: python -c "from policyengine_core.data import Dataset; print('Core import OK')" Test: - runs-on: larger-runner + runs-on: ubuntu-latest needs: Lint env: HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index b3554604..027c2ef5 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 8) + print(2 + 0) else: CPS_2021().generate() CPS_2022().generate() From eb96cd5f706b0b718c39e36fa4fd1854bb3e3b0d Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 16:53:57 -0400 Subject: [PATCH 20/56] trying everything --- .github/workflows/pr_code_changes.yaml | 2 +- policyengine_us_data/datasets/cps/cps.py | 2 +- pyproject.toml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 385e5a4c..02209591 100644 --- a/.github/workflows/pr_code_changes.yaml +++ 
b/.github/workflows/pr_code_changes.yaml @@ -63,7 +63,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.11' + python-version: '3.11.12' - name: Install package run: uv pip install -e .[dev] --system diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 027c2ef5..afbf223f 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 0) + print(2 + 9) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index 6c767ede..d87290a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,11 +15,11 @@ authors = [ {name = "PolicyEngine", email = "hello@policyengine.org"}, ] license = {file = "LICENSE"} -requires-python = ">=3.11, <3.13.0" +requires-python = ">=3.11, <3.11.13" dependencies = [ "policyengine-us==1.333.0", "policyengine-core>=3.14.1", - "pandas==2.3.1", + "pandas==2.3.0", "requests", "tqdm", "microdf_python>=0.4.3", From 59ff94e82cd4dbd0aba16b488fd0b8ec16ca5531 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 17:02:45 -0400 Subject: [PATCH 21/56] relaxing python version in pyproject.toml --- policyengine_us_data/datasets/cps/cps.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index afbf223f..3173d4d6 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 9) + print(3 + 9) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index d87290a2..fe5fda52 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ authors = [ {name = "PolicyEngine", email = "hello@policyengine.org"}, ] license = {file = "LICENSE"} -requires-python = ">=3.11, <3.11.13" +requires-python = ">=3.11, <3.13.0" dependencies = [ "policyengine-us==1.333.0", "policyengine-core>=3.14.1", From d3fa67bf98762b48c6fe2397275c1d0aac2ff77b Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 17:29:12 -0400 Subject: [PATCH 22/56] putting things back in order. 
--- policyengine_us_data/datasets/cps/cps.py | 7 ------- pyproject.toml | 2 +- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 3173d4d6..d9957cbb 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -100,13 +100,7 @@ def downsample(self, frac: float): original_dtypes = { key: original_data[key].dtype for key in original_data } - print("\n\nHERE IS THE PROBLEM-----") - print(f"frac is {frac}") - print(self) - print(Microsimulation) sim = Microsimulation(dataset=self) - print(sim) - print(sim.subsample) sim.subsample(frac=frac) for key in original_data: @@ -2013,7 +2007,6 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(3 + 9) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index fe5fda52..4bec19eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ authors = [ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ - "policyengine-us==1.333.0", + "policyengine-us==1.340.1", "policyengine-core>=3.14.1", "pandas==2.3.0", "requests", From 273c48d7bc9db1d6f06fa859897b63c30d37b044 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Sat, 12 Jul 2025 13:01:15 +0100 Subject: [PATCH 23/56] Use normal runner in PR tests --- .github/workflows/pr_code_changes.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 02209591..c84a4b97 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -49,7 +49,7 @@ jobs: run: python -c "from policyengine_core.data import Dataset; print('Core import OK')" Test: - runs-on: ubuntu-latest + runs-on: ubuntu-latest needs: Lint env: HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} From 8c2fbda847e9945878afa4085476f56895c360f1 Mon Sep 17 00:00:00 2001 From: baogorek Date: Sat, 12 Jul 2025 09:53:07 -0400 Subject: [PATCH 24/56] added the 3.11.12 pin --- .github/workflows/code_changes.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/code_changes.yaml b/.github/workflows/code_changes.yaml index edd804db..c2340d14 100644 --- a/.github/workflows/code_changes.yaml +++ b/.github/workflows/code_changes.yaml @@ -38,7 +38,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.11' + python-version: '3.11.12' - uses: "google-github-actions/auth@v2" with: workload_identity_provider: "projects/322898545428/locations/global/workloadIdentityPools/policyengine-research-id-pool/providers/prod-github-provider" From edb09456bb8548b8b4eb94136122ab5a5b33586e Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 20:00:50 -0400 Subject: [PATCH 25/56] cps.py --- policyengine_us_data/datasets/cps/cps.py | 1 + 1 file changed, 1 insertion(+) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index d9957cbb..202f9c69 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2007,6 +2007,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() + print(3) else: CPS_2021().generate() CPS_2022().generate() From 994ac15a636b99f951e205ecb3a861e72cdc3472 Mon Sep 17 00:00:00 2001 From: baogorek 
Date: Sun, 13 Jul 2025 20:32:26 -0400 Subject: [PATCH 26/56] adding diagnostics --- .../datasets/cps/enhanced_cps.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 6ad510f3..17d3e862 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -249,6 +249,23 @@ def generate(self): ) data["household_weight"][year] = optimised_weights + print("\n\n---reweighting quick diagnostics----\n") + estimate = optimised_weights @ loss_matrix + rel_error = ( + ((estimate - targets_array) + 1) / (targets_array + 1) + ) ** 2 + print( + f"rel_error: min: {np.min(rel_error):.2f}, max: {np.max(rel_error):.2f}", + f"mean: {np.mean(rel_error):.2f}, median: {np.median(rel_error):.2f}" + ) + print("Relative error over 100% for:") + for i in np.where(rel_error > 1)[0]: + print(f"target_name: {loss_matrix.columns[i]}") + print(f"target_value: {targets_array[i]}") + print(f"estimate_value: {estimate[i]}") + print(f"has rel_error: {rel_error.values[i]:.2f}\n") + print("---End of reweighting quick diagnostics------") + self.save_dataset(data) From 341a3559f4368f65947db8f0ebe4db67e39a671c Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 20:32:47 -0400 Subject: [PATCH 27/56] lint --- policyengine_us_data/datasets/cps/enhanced_cps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 17d3e862..0da67ceb 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -256,7 +256,7 @@ def generate(self): ) ** 2 print( f"rel_error: min: {np.min(rel_error):.2f}, max: {np.max(rel_error):.2f}", - f"mean: {np.mean(rel_error):.2f}, median: {np.median(rel_error):.2f}" + f"mean: {np.mean(rel_error):.2f}, median: {np.median(rel_error):.2f}", ) print("Relative error over 100% for:") for i in np.where(rel_error > 1)[0]: From c2ab4b6466de68c8970ac859157bc941fc56287b Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 22:27:46 -0400 Subject: [PATCH 28/56] taking out bad targets --- policyengine_us_data/datasets/cps/cps.py | 1 - .../datasets/cps/enhanced_cps.py | 59 +++++++++++++++++-- 2 files changed, 53 insertions(+), 7 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 202f9c69..d9957cbb 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2007,7 +2007,6 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(3) else: CPS_2021().generate() CPS_2022().generate() diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 0da67ceb..e7a57044 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -40,7 +40,7 @@ def reweight( targets_array, dropout_rate=0.05, log_path="calibration_log.csv", - penalty_approach=None, + epochs=150, ): target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") @@ -58,8 +58,8 @@ def reweight( np.log(original_weights), requires_grad=True, dtype=torch.float32 ) - # TO DO: replace this with a call to the python reweight.py package. 
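The quick diagnostics introduced in the patches above score each calibration target with a +1-shifted squared relative error, so zero-valued targets do not divide by zero, and then list every target whose error exceeds 100%. A toy numeric check of that expression (made-up estimate and target values, assuming only numpy):

    import numpy as np

    targets_array = np.array([0.0, 1_000.0, 2_000_000.0])  # one zero-valued target
    estimate = np.array([50.0, 900.0, 2_100_000.0])        # hypothetical weighted estimates

    rel_error = (((estimate - targets_array) + 1) / (targets_array + 1)) ** 2
    print(rel_error)      # stays finite for the zero-valued target
    print(rel_error > 1)  # the mask used to report targets with error over 100%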
- def loss(weights, penalty_approach=penalty_approach): + # TODO: replace this functionality from the microcalibrate package. + def loss(weights): # Check for Nans in either the weights or the loss matrix if torch.isnan(weights).any(): raise ValueError("Weights contain NaNs") @@ -129,7 +129,7 @@ def dropout_weights(weights, p): start_loss = None - iterator = trange(500) + iterator = trange(epochs) performance = pd.DataFrame() for i in iterator: optimizer.zero_grad() @@ -229,13 +229,37 @@ def generate(self): original_weights = original_weights.values + np.random.normal( 1, 0.1, len(original_weights) ) + + bad_targets = [ + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "state/RI/adjusted_gross_income/amount/-inf_1", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "state/RI/adjusted_gross_income/amount/-inf_1", + "target_name: nation/irs/exempt interest/count/AGI in -inf-inf/taxable/All", + ] + + # Run the optimization procedure to get (close to) minimum loss weights for year in range(self.start_year, self.end_year + 1): loss_matrix, targets_array = build_loss_matrix( self.input_dataset, year ) - + zero_mask = np.isclose(targets_array, 0.0, atol=0.1) bad_mask = loss_matrix.columns.isin(bad_targets) - keep_mask_bool = ~bad_mask + keep_mask_bool = ~(zero_mask | bad_mask) keep_idx = np.where(keep_mask_bool)[0] loss_matrix_clean = loss_matrix.iloc[:, keep_idx] targets_array_clean = targets_array[keep_idx] @@ -245,10 +269,33 @@ def generate(self): original_weights, loss_matrix_clean, targets_array_clean, + loss_matrix_clean, + targets_array_clean, log_path="calibration_log.csv", + epochs=150, ) data["household_weight"][year] = optimised_weights + print("\n\n---reweighting quick diagnostics----\n") + estimate = optimised_weights @ loss_matrix_clean + rel_error = ( + ((estimate - targets_array_clean) + 1) + / (targets_array_clean + 1) + ) ** 2 + print( + f"rel_error: min: {np.min(rel_error):.2f}, " + f"max: {np.max(rel_error):.2f} " + f"mean: {np.mean(rel_error):.2f}, " + f"median: {np.median(rel_error):.2f}" + ) + print("Relative error over 100% for:") + for i in np.where(rel_error > 1)[0]: + print(f"target_name: {loss_matrix_clean.columns[i]}") + print(f"target_value: {targets_array_clean[i]}") + 
print(f"estimate_value: {estimate[i]}") + print(f"has rel_error: {rel_error[i]:.2f}\n") + print("---End of reweighting quick diagnostics------") + print("\n\n---reweighting quick diagnostics----\n") estimate = optimised_weights @ loss_matrix rel_error = ( From 6f7a03a76dc95d7f9ebfd20f1df6240bd11593bc Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 23:05:09 -0400 Subject: [PATCH 29/56] fixing workflow arg passthrough --- .github/workflows/pr_code_changes.yaml | 16 +++++++++++++--- changelog_entry.yaml | 6 ++++++ pyproject.toml | 4 ++-- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index c84a4b97..56224a2e 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -11,6 +11,14 @@ on: - tests/** - .github/workflows/** + workflow_call: + inputs: + TEST_LITE: + description: 'Run in lite mode' + type: boolean + required: false + default: false + jobs: Lint: runs-on: ubuntu-latest @@ -53,6 +61,7 @@ jobs: needs: Lint env: HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} + TEST_LITE: ${{ inputs.TEST_LITE }} steps: - name: Checkout repo uses: actions/checkout@v2 @@ -63,7 +72,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.11.12' + python-version: '3.11' - name: Install package run: uv pip install -e .[dev] --system @@ -75,8 +84,9 @@ jobs: - name: Build datasets run: make data env: - TEST_LITE: true - PYTHON_LOG_LEVEL: INFO + TEST_LITE: ${{ env.TEST_LITE }} + PYTHON_LOG_LEVEL: INFO + - name: Save calibration log uses: actions/upload-artifact@v4 with: diff --git a/changelog_entry.yaml b/changelog_entry.yaml index dcce3f1a..bce8b349 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,4 +1,10 @@ - bump: patch changes: changed: + - bad targets (causing problems with estimation) removed - lite mode now builds CPS_2023 in addition to CPS_2024 + - gave reweight an epochs argument and set it at 150 for optimization + - updating minimum versions on policyengine-us and pandas dependencies + fixed: + - manual workflow now can call PR code changes + diff --git a/pyproject.toml b/pyproject.toml index 4bec19eb..481cbc37 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,9 +17,9 @@ authors = [ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ - "policyengine-us==1.340.1", + "policyengine-us>=1.340.1", "policyengine-core>=3.14.1", - "pandas==2.3.0", + "pandas>=2.3.0", "requests", "tqdm", "microdf_python>=0.4.3", From 3dba2a2aa3a578aeaa7e7acde71e53d150669036 Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 23:09:32 -0400 Subject: [PATCH 30/56] deps and defaults --- .github/workflows/code_changes.yaml | 2 +- .github/workflows/pr_code_changes.yaml | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/code_changes.yaml b/.github/workflows/code_changes.yaml index c2340d14..edd804db 100644 --- a/.github/workflows/code_changes.yaml +++ b/.github/workflows/code_changes.yaml @@ -38,7 +38,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.11.12' + python-version: '3.11' - uses: "google-github-actions/auth@v2" with: workload_identity_provider: "projects/322898545428/locations/global/workloadIdentityPools/policyengine-research-id-pool/providers/prod-github-provider" diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 
56224a2e..1e05b564 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -17,7 +17,7 @@ on: description: 'Run in lite mode' type: boolean required: false - default: false + default: true jobs: Lint: diff --git a/pyproject.toml b/pyproject.toml index 481cbc37..f983258d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ "policyengine-us>=1.340.1", - "policyengine-core>=3.14.1", + "policyengine-core>=3.17.1", "pandas>=2.3.0", "requests", "tqdm", From 7710a4cd0f58de7b2120f146228977e9c46f253d Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 23:12:21 -0400 Subject: [PATCH 31/56] wrong pipeline for manual test --- .github/workflows/manual_tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/manual_tests.yaml b/.github/workflows/manual_tests.yaml index fb13ba89..fd6fa061 100644 --- a/.github/workflows/manual_tests.yaml +++ b/.github/workflows/manual_tests.yaml @@ -11,7 +11,7 @@ on: jobs: test: - uses: ./.github/workflows/code_changes.yaml + uses: ./.github/workflows/pr_code_changes.yaml with: TEST_LITE: ${{ github.event.inputs.test_lite }} secrets: inherit From 27f46fd8d19199fad6006675bcab231da67968af Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 23:30:46 -0400 Subject: [PATCH 32/56] trying again to get the manual test to work --- .github/workflows/manual_tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/manual_tests.yaml b/.github/workflows/manual_tests.yaml index fd6fa061..55667dbc 100644 --- a/.github/workflows/manual_tests.yaml +++ b/.github/workflows/manual_tests.yaml @@ -13,5 +13,5 @@ jobs: test: uses: ./.github/workflows/pr_code_changes.yaml with: - TEST_LITE: ${{ github.event.inputs.test_lite }} + TEST_LITE: ${{ inputs.test_lite }} secrets: inherit From fef1eca57d99d8359f335ac4886eebde5b45c6c9 Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 23:53:27 -0400 Subject: [PATCH 33/56] reverting to older workflow code --- .github/workflows/manual_tests.yaml | 17 ----------------- .github/workflows/pr_code_changes.yaml | 14 ++------------ changelog_entry.yaml | 4 +--- 3 files changed, 3 insertions(+), 32 deletions(-) delete mode 100644 .github/workflows/manual_tests.yaml diff --git a/.github/workflows/manual_tests.yaml b/.github/workflows/manual_tests.yaml deleted file mode 100644 index 55667dbc..00000000 --- a/.github/workflows/manual_tests.yaml +++ /dev/null @@ -1,17 +0,0 @@ -name: Manual tests - -on: - workflow_dispatch: - inputs: - test_lite: - description: 'Run in lite mode' - required: true - default: true - type: boolean - -jobs: - test: - uses: ./.github/workflows/pr_code_changes.yaml - with: - TEST_LITE: ${{ inputs.test_lite }} - secrets: inherit diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 1e05b564..4e30d089 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -11,14 +11,6 @@ on: - tests/** - .github/workflows/** - workflow_call: - inputs: - TEST_LITE: - description: 'Run in lite mode' - type: boolean - required: false - default: true - jobs: Lint: runs-on: ubuntu-latest @@ -61,7 +53,6 @@ jobs: needs: Lint env: HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} - TEST_LITE: ${{ inputs.TEST_LITE }} steps: - name: Checkout repo uses: actions/checkout@v2 @@ -84,9 +75,8 @@ jobs: - name: Build datasets run: make data 
env: - TEST_LITE: ${{ env.TEST_LITE }} - PYTHON_LOG_LEVEL: INFO - + TEST_LITE: true + PYTHON_LOG_LEVEL: INFO - name: Save calibration log uses: actions/upload-artifact@v4 with: diff --git a/changelog_entry.yaml b/changelog_entry.yaml index bce8b349..3f9b8627 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -5,6 +5,4 @@ - lite mode now builds CPS_2023 in addition to CPS_2024 - gave reweight an epochs argument and set it at 150 for optimization - updating minimum versions on policyengine-us and pandas dependencies - fixed: - - manual workflow now can call PR code changes - + - getting rid of non-working manual workflow code From 5eb10501cd4e8f33925411de7f4574e3dec413f8 Mon Sep 17 00:00:00 2001 From: baogorek Date: Mon, 14 Jul 2025 00:12:37 -0400 Subject: [PATCH 34/56] cleaning up enhanced_cps.py --- .../datasets/cps/enhanced_cps.py | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index e7a57044..5c82d724 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -249,7 +249,7 @@ def generate(self): "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", "state/RI/adjusted_gross_income/amount/-inf_1", - "target_name: nation/irs/exempt interest/count/AGI in -inf-inf/taxable/All", + "nation/irs/exempt interest/count/AGI in -inf-inf/taxable/All", ] # Run the optimization procedure to get (close to) minimum loss weights @@ -296,23 +296,6 @@ def generate(self): print(f"has rel_error: {rel_error[i]:.2f}\n") print("---End of reweighting quick diagnostics------") - print("\n\n---reweighting quick diagnostics----\n") - estimate = optimised_weights @ loss_matrix - rel_error = ( - ((estimate - targets_array) + 1) / (targets_array + 1) - ) ** 2 - print( - f"rel_error: min: {np.min(rel_error):.2f}, max: {np.max(rel_error):.2f}", - f"mean: {np.mean(rel_error):.2f}, median: {np.median(rel_error):.2f}", - ) - print("Relative error over 100% for:") - for i in np.where(rel_error > 1)[0]: - print(f"target_name: {loss_matrix.columns[i]}") - print(f"target_value: {targets_array[i]}") - print(f"estimate_value: {estimate[i]}") - print(f"has rel_error: {rel_error.values[i]:.2f}\n") - print("---End of reweighting quick diagnostics------") - self.save_dataset(data) From 1fb4318b21072a9c5dbd2824216be49655f0b9b2 Mon Sep 17 00:00:00 2001 From: MaxGhenis Date: Mon, 14 Jul 2025 15:33:13 +0000 Subject: [PATCH 35/56] Update package version --- CHANGELOG.md | 11 +++++++++++ changelog.yaml | 9 +++++++++ changelog_entry.yaml | 8 -------- pyproject.toml | 2 +- 4 files changed, 21 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6299d8fb..e355d4dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## [1.37.1] - 2025-07-14 15:33:11 + +### Changed + +- bad targets (causing problems with estimation) removed +- lite mode now builds CPS_2023 in addition to CPS_2024 +- gave reweight an epochs argument and set it at 150 for optimization +- updating minimum versions on policyengine-us and pandas dependencies +- getting rid of non-working manual workflow code + ## [1.37.0] - 2025-07-09 14:58:33 ### Added @@ -520,6 +530,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 +[1.37.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.37.0...1.37.1 [1.37.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.36.2...1.37.0 [1.36.2]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.36.1...1.36.2 [1.36.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.36.0...1.36.1 diff --git a/changelog.yaml b/changelog.yaml index 699b2430..af7cdf32 100644 --- a/changelog.yaml +++ b/changelog.yaml @@ -433,3 +433,12 @@ added: - Medicaid state level calibration targets. date: 2025-07-09 14:58:33 +- bump: patch + changes: + changed: + - bad targets (causing problems with estimation) removed + - lite mode now builds CPS_2023 in addition to CPS_2024 + - gave reweight an epochs argument and set it at 150 for optimization + - updating minimum versions on policyengine-us and pandas dependencies + - getting rid of non-working manual workflow code + date: 2025-07-14 15:33:11 diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 3f9b8627..e69de29b 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,8 +0,0 @@ -- bump: patch - changes: - changed: - - bad targets (causing problems with estimation) removed - - lite mode now builds CPS_2023 in addition to CPS_2024 - - gave reweight an epochs argument and set it at 150 for optimization - - updating minimum versions on policyengine-us and pandas dependencies - - getting rid of non-working manual workflow code diff --git a/pyproject.toml b/pyproject.toml index f983258d..5a75693f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta" [project] name = "policyengine_us_data" -version = "1.37.0" +version = "1.37.1" description = "A package to create representative microdata for the US." readme = "README.md" authors = [ From a62328a6f47293f90e1e696d03b49b96c044321b Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Fri, 11 Jul 2025 16:24:22 +0200 Subject: [PATCH 36/56] attempting to vectorize minimizing of ecps --- changelog_entry.yaml | 4 ++ .../datasets/cps/enhanced_cps.py | 53 +++++++------------ policyengine_us_data/utils/minimise.py | 51 ++++++++++++------ 3 files changed, 59 insertions(+), 49 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..84eeb584 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: minor + changes: + added: + - Enhanced CPS minimizing tests. \ No newline at end of file diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 5c82d724..6616d54c 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -58,8 +58,8 @@ def reweight( np.log(original_weights), requires_grad=True, dtype=torch.float32 ) - # TODO: replace this functionality from the microcalibrate package. - def loss(weights): + # TO DO: replace this with a call to the python reweight.py package. 
+ def loss(weights, penalty_approach="l0_sigmoid"): # Check for Nans in either the weights or the loss matrix if torch.isnan(weights).any(): raise ValueError("Weights contain NaNs") @@ -76,43 +76,30 @@ def loss(weights): if torch.isnan(rel_error_normalized).any(): raise ValueError("Relative error contains NaNs") - if penalty_approach is not None: - # L0 penalty (approximated with smooth function) - # Since L0 is non-differentiable, we use a smooth approximation - # Common approaches: + # L0 penalty (approximated with smooth function) + # Since L0 is non-differentiable, we use a smooth approximation + # Common approaches: - epsilon = 1e-3 # Threshold for "near zero" - l0_penalty_weight = 1e-1 # Adjust this hyperparameter + epsilon = 1e-3 # Threshold for "near zero" + l0_penalty_weight = 1e-1 # Adjust this hyperparameter - # Option 1: Sigmoid approximation - if penalty_approach == "l0_sigmoid": - smoothed_l0 = torch.sigmoid( - (weights - epsilon) / (epsilon * 0.1) - ).mean() + # Option 1: Sigmoid approximation + if penalty_approach == "l0_sigmoid": + smoothed_l0 = torch.sigmoid( + (weights - epsilon) / (epsilon * 0.1) + ).mean() - # Option 2: Log-sum penalty (smoother) - if penalty_approach == "l0_log": - smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len( - weights - ) + # Option 2: Log-sum penalty (smoother) + if penalty_approach == "l0_log": + smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len(weights) - # Option 3: Exponential penalty - if penalty_approach == "l0_exp": - smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() + # Option 3: Exponential penalty + if penalty_approach == "l0_exp": + smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() - # L1 penalty - l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs + # L1 penalty - if penalty_approach == "l1": - l1 = torch.mean(weights) - return rel_error_normalized.mean() + l1_penalty_weight * l1 - - return ( - rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 - ) - - else: - return rel_error_normalized.mean() + return rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 def dropout_weights(weights, p): if p == 0: diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index df193c6e..ca985378 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -5,21 +5,10 @@ import pandas as pd import h5py from policyengine_us_data.storage import STORAGE_FOLDER -from typing import Optional, Callable +from typing import Optional -bad_targets = [ - "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", - "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", - "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", - "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", - "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", -] - -def create_calibration_log_file(file_path, epoch=0): +def create_calibration_log_file(file_path): dataset = Dataset.from_file(file_path) loss_matrix, targets = build_loss_matrix(dataset, 2024) @@ -112,6 +101,27 @@ def losses_for_candidates( return losses +def minimise_dataset( + dataset, 
output_path: str, loss_rel_change_max: float +) -> None: + dataset = str(dataset) + create_calibration_log_file(dataset) + + dataset = Dataset.from_file(dataset) + loss_matrix = build_loss_matrix(dataset, 2024) + + sim = Microsimulation(dataset=dataset) + + weights = sim.calculate("household_weight", 2024).values + estimate_matrix, targets = loss_matrix + is_national = estimate_matrix.columns.str.startswith("nation/") + nation_normalisation_factor = is_national * (1 / is_national.sum()) + state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) + normalisation_factor = np.where( + is_national, nation_normalisation_factor, state_normalisation_factor + ) + weights @ estimate_matrix + def get_loss_from_mask( weights, inclusion_mask, estimate_matrix, targets, normalisation_factor ): @@ -185,16 +195,25 @@ def candidate_loss_contribution( replace=False, ) - # Compute losses for the batch in one shot + # more efficient approach to compute losses for candidate households to be removed + + # 1. sample only households that are currently *included* + indices = np.random.choice( + np.where(full_mask)[0], + size=int(full_mask.sum() * VIEW_FRACTION_PER_ITERATION), + replace=False, + ) + # 2. compute losses for the batch in one shot candidate_losses = losses_for_candidates( weights, indices, estimate_matrix, targets, normalisation_factor ) - - # Convert to relative change vs. baseline + # 3. convert to relative change vs. baseline household_loss_rel_changes = ( candidate_losses - baseline_loss ) / baseline_loss + inclusion_mask = full_mask.copy() + household_loss_rel_changes = np.array(household_loss_rel_changes) # Sort by the relative change in loss sorted_indices = np.argsort(household_loss_rel_changes) From 6d3f8b4daea6ab498b105bf9429b74e52462cde4 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Fri, 11 Jul 2025 19:19:58 +0200 Subject: [PATCH 37/56] add notebook with testing functionality (havent tested locally) --- .../datasets/cps/enhanced_cps.py | 9 +- policyengine_us_data/utils/minimise.py | 2 +- test_minimization_approach.ipynb | 210 +----------------- 3 files changed, 16 insertions(+), 205 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 6616d54c..ca53a84d 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -40,7 +40,7 @@ def reweight( targets_array, dropout_rate=0.05, log_path="calibration_log.csv", - epochs=150, + penalty_approach="l0_sigmoid", ): target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") @@ -59,7 +59,7 @@ def reweight( ) # TO DO: replace this with a call to the python reweight.py package. 
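The penalty_approach options introduced in PATCH 36 above are all smooth surrogates for an L0 count of non-zero household weights, which is not differentiable and so cannot be optimised directly by gradient descent. A rough sketch of the sigmoid variant on a toy weight vector (epsilon and the penalty weight mirror the hyperparameters in the diff; the weights themselves are made up):

```python
import torch

# Toy non-negative household weights (illustrative values only)
weights = torch.tensor([0.0, 0.0005, 0.002, 3.5, 120.0])

epsilon = 1e-3            # threshold for "near zero", as in the patch
l0_penalty_weight = 1e-1  # strength of the sparsity term

# Sigmoid approximation: ~0 for weights well below epsilon, ~1 well above,
# so the mean approximates the fraction of households with non-negligible weight.
smoothed_l0 = torch.sigmoid((weights - epsilon) / (epsilon * 0.1)).mean()

# Added to the calibration loss, this term rewards the optimiser for driving
# uninformative household weights towards zero, which is what later allows the
# dataset to be shrunk by dropping near-zero-weight households.
penalty_term = l0_penalty_weight * smoothed_l0
print(round(smoothed_l0.item(), 3), round(penalty_term.item(), 4))
```

The log-sum and exponential variants in the same hunk are smoother alternatives to the sharp sigmoid threshold, and the l1 option simply penalises the mean weight.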
- def loss(weights, penalty_approach="l0_sigmoid"): + def loss(weights, penalty_approach=penalty_approach): # Check for Nans in either the weights or the loss matrix if torch.isnan(weights).any(): raise ValueError("Weights contain NaNs") @@ -98,6 +98,11 @@ def loss(weights, penalty_approach="l0_sigmoid"): smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() # L1 penalty + l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs + + if penalty_approach == "l1": + l1 = torch.mean(weights) + return rel_error_normalized.mean() + l1_penalty_weight * l1 return rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index ca985378..da2cb7d1 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -386,7 +386,7 @@ def minimise_dataset( f.create_dataset(f"{variable}/{year}", data=value) print(f"Saved minimised dataset to {output_path}") - create_calibration_log_file(output_path, epoch=500) + create_calibration_log_file(output_path) if __name__ == "__main__": diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 54f3c6fa..519d2725 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 5, "id": "d6dc9cca", "metadata": {}, "outputs": [], @@ -12,199 +12,15 @@ "from policyengine_us import Microsimulation\n", "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", "from policyengine_us_data.utils import build_loss_matrix\n", - "import numpy as np\n", - "import os" + "import numpy as np" ] }, { "cell_type": "code", "execution_count": null, - "id": "6daabe7c", - "metadata": {}, - "outputs": [], - "source": [ - "# Original ECPS 2024 dataset size (for household entity): 41310\n", - "# After minimization through \"candidate_loss_contribution\" and a 1.0 max error change (for household entity): 20655 \n", - "# After minimization through \"candidate_loss_contribution\" and a 0.001 max error change (for household entity): 24786\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, "id": "db975ac1", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Targeting Medicaid enrollment for AK with target 231577k\n", - "Targeting Medicaid enrollment for AL with target 766009k\n", - "Targeting Medicaid enrollment for AR with target 733561k\n", - "Targeting Medicaid enrollment for AZ with target 1778734k\n", - "Targeting Medicaid enrollment for CA with target 12172695k\n", - "Targeting Medicaid enrollment for CO with target 1058326k\n", - "Targeting Medicaid enrollment for CT with target 904321k\n", - "Targeting Medicaid enrollment for DC with target 240020k\n", - "Targeting Medicaid enrollment for DE with target 236840k\n", - "Targeting Medicaid enrollment for FL with target 3568648k\n", - "Targeting Medicaid enrollment for GA with target 1699279k\n", - "Targeting Medicaid enrollment for HI with target 376318k\n", - "Targeting Medicaid enrollment for IA with target 586748k\n", - "Targeting Medicaid enrollment for ID with target 296968k\n", - "Targeting Medicaid enrollment for IL with target 2918179k\n", - "Targeting Medicaid enrollment for IN with target 1623361k\n", - "Targeting Medicaid enrollment for KS with target 335902k\n", - "Targeting Medicaid enrollment for KY with target 1244822k\n", - "Targeting Medicaid enrollment for LA 
with target 1377806k\n", - "Targeting Medicaid enrollment for MA with target 1453344k\n", - "Targeting Medicaid enrollment for MD with target 1280697k\n", - "Targeting Medicaid enrollment for ME with target 322306k\n", - "Targeting Medicaid enrollment for MI with target 2194067k\n", - "Targeting Medicaid enrollment for MN with target 1146667k\n", - "Targeting Medicaid enrollment for MO with target 1118780k\n", - "Targeting Medicaid enrollment for MS with target 514730k\n", - "Targeting Medicaid enrollment for MT with target 193278k\n", - "Targeting Medicaid enrollment for NC with target 2469712k\n", - "Targeting Medicaid enrollment for ND with target 100543k\n", - "Targeting Medicaid enrollment for NE with target 302971k\n", - "Targeting Medicaid enrollment for NH with target 166813k\n", - "Targeting Medicaid enrollment for NJ with target 1506239k\n", - "Targeting Medicaid enrollment for NM with target 686825k\n", - "Targeting Medicaid enrollment for NV with target 713936k\n", - "Targeting Medicaid enrollment for NY with target 5946806k\n", - "Targeting Medicaid enrollment for OH with target 2596879k\n", - "Targeting Medicaid enrollment for OK with target 894911k\n", - "Targeting Medicaid enrollment for OR with target 1123313k\n", - "Targeting Medicaid enrollment for PA with target 2783389k\n", - "Targeting Medicaid enrollment for RI with target 273400k\n", - "Targeting Medicaid enrollment for SC with target 932515k\n", - "Targeting Medicaid enrollment for SD with target 126952k\n", - "Targeting Medicaid enrollment for TN with target 1268904k\n", - "Targeting Medicaid enrollment for TX with target 3821806k\n", - "Targeting Medicaid enrollment for UT with target 300742k\n", - "Targeting Medicaid enrollment for VA with target 1596777k\n", - "Targeting Medicaid enrollment for VT with target 151833k\n", - "Targeting Medicaid enrollment for WA with target 1776116k\n", - "Targeting Medicaid enrollment for WI with target 1108320k\n", - "Targeting Medicaid enrollment for WV with target 467632k\n", - "Targeting Medicaid enrollment for WY with target 57320k\n", - "Targeting Medicaid enrollment for AK with target 231577k\n", - "Targeting Medicaid enrollment for AL with target 766009k\n", - "Targeting Medicaid enrollment for AR with target 733561k\n", - "Targeting Medicaid enrollment for AZ with target 1778734k\n", - "Targeting Medicaid enrollment for CA with target 12172695k\n", - "Targeting Medicaid enrollment for CO with target 1058326k\n", - "Targeting Medicaid enrollment for CT with target 904321k\n", - "Targeting Medicaid enrollment for DC with target 240020k\n", - "Targeting Medicaid enrollment for DE with target 236840k\n", - "Targeting Medicaid enrollment for FL with target 3568648k\n", - "Targeting Medicaid enrollment for GA with target 1699279k\n", - "Targeting Medicaid enrollment for HI with target 376318k\n", - "Targeting Medicaid enrollment for IA with target 586748k\n", - "Targeting Medicaid enrollment for ID with target 296968k\n", - "Targeting Medicaid enrollment for IL with target 2918179k\n", - "Targeting Medicaid enrollment for IN with target 1623361k\n", - "Targeting Medicaid enrollment for KS with target 335902k\n", - "Targeting Medicaid enrollment for KY with target 1244822k\n", - "Targeting Medicaid enrollment for LA with target 1377806k\n", - "Targeting Medicaid enrollment for MA with target 1453344k\n", - "Targeting Medicaid enrollment for MD with target 1280697k\n", - "Targeting Medicaid enrollment for ME with target 322306k\n", - "Targeting Medicaid enrollment for MI with 
target 2194067k\n", - "Targeting Medicaid enrollment for MN with target 1146667k\n", - "Targeting Medicaid enrollment for MO with target 1118780k\n", - "Targeting Medicaid enrollment for MS with target 514730k\n", - "Targeting Medicaid enrollment for MT with target 193278k\n", - "Targeting Medicaid enrollment for NC with target 2469712k\n", - "Targeting Medicaid enrollment for ND with target 100543k\n", - "Targeting Medicaid enrollment for NE with target 302971k\n", - "Targeting Medicaid enrollment for NH with target 166813k\n", - "Targeting Medicaid enrollment for NJ with target 1506239k\n", - "Targeting Medicaid enrollment for NM with target 686825k\n", - "Targeting Medicaid enrollment for NV with target 713936k\n", - "Targeting Medicaid enrollment for NY with target 5946806k\n", - "Targeting Medicaid enrollment for OH with target 2596879k\n", - "Targeting Medicaid enrollment for OK with target 894911k\n", - "Targeting Medicaid enrollment for OR with target 1123313k\n", - "Targeting Medicaid enrollment for PA with target 2783389k\n", - "Targeting Medicaid enrollment for RI with target 273400k\n", - "Targeting Medicaid enrollment for SC with target 932515k\n", - "Targeting Medicaid enrollment for SD with target 126952k\n", - "Targeting Medicaid enrollment for TN with target 1268904k\n", - "Targeting Medicaid enrollment for TX with target 3821806k\n", - "Targeting Medicaid enrollment for UT with target 300742k\n", - "Targeting Medicaid enrollment for VA with target 1596777k\n", - "Targeting Medicaid enrollment for VT with target 151833k\n", - "Targeting Medicaid enrollment for WA with target 1776116k\n", - "Targeting Medicaid enrollment for WI with target 1108320k\n", - "Targeting Medicaid enrollment for WV with target 467632k\n", - "Targeting Medicaid enrollment for WY with target 57320k\n", - "Iteration 1: Loss changed from 3.2762747622384236e-07 to 3.269240481699973e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 2: Loss changed from 3.269240481699973e-07 to 3.260236288186747e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 3: Loss changed from 3.260236288186747e-07 to 3.258031076364204e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 4: Loss changed from 3.258031076364204e-07 to 3.2572688458943833e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 5: Loss changed from 3.2572688458943833e-07 to 3.2598496400284314e-07, which is too high (0.08%). 
Stopping.\n", - "Weight relative change: 99.10%\n", - "Saved minimised dataset to /Users/movil1/Desktop/PYTHONJOBS/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", - "Targeting Medicaid enrollment for AK with target 231577k\n", - "Targeting Medicaid enrollment for AL with target 766009k\n", - "Targeting Medicaid enrollment for AR with target 733561k\n", - "Targeting Medicaid enrollment for AZ with target 1778734k\n", - "Targeting Medicaid enrollment for CA with target 12172695k\n", - "Targeting Medicaid enrollment for CO with target 1058326k\n", - "Targeting Medicaid enrollment for CT with target 904321k\n", - "Targeting Medicaid enrollment for DC with target 240020k\n", - "Targeting Medicaid enrollment for DE with target 236840k\n", - "Targeting Medicaid enrollment for FL with target 3568648k\n", - "Targeting Medicaid enrollment for GA with target 1699279k\n", - "Targeting Medicaid enrollment for HI with target 376318k\n", - "Targeting Medicaid enrollment for IA with target 586748k\n", - "Targeting Medicaid enrollment for ID with target 296968k\n", - "Targeting Medicaid enrollment for IL with target 2918179k\n", - "Targeting Medicaid enrollment for IN with target 1623361k\n", - "Targeting Medicaid enrollment for KS with target 335902k\n", - "Targeting Medicaid enrollment for KY with target 1244822k\n", - "Targeting Medicaid enrollment for LA with target 1377806k\n", - "Targeting Medicaid enrollment for MA with target 1453344k\n", - "Targeting Medicaid enrollment for MD with target 1280697k\n", - "Targeting Medicaid enrollment for ME with target 322306k\n", - "Targeting Medicaid enrollment for MI with target 2194067k\n", - "Targeting Medicaid enrollment for MN with target 1146667k\n", - "Targeting Medicaid enrollment for MO with target 1118780k\n", - "Targeting Medicaid enrollment for MS with target 514730k\n", - "Targeting Medicaid enrollment for MT with target 193278k\n", - "Targeting Medicaid enrollment for NC with target 2469712k\n", - "Targeting Medicaid enrollment for ND with target 100543k\n", - "Targeting Medicaid enrollment for NE with target 302971k\n", - "Targeting Medicaid enrollment for NH with target 166813k\n", - "Targeting Medicaid enrollment for NJ with target 1506239k\n", - "Targeting Medicaid enrollment for NM with target 686825k\n", - "Targeting Medicaid enrollment for NV with target 713936k\n", - "Targeting Medicaid enrollment for NY with target 5946806k\n", - "Targeting Medicaid enrollment for OH with target 2596879k\n", - "Targeting Medicaid enrollment for OK with target 894911k\n", - "Targeting Medicaid enrollment for OR with target 1123313k\n", - "Targeting Medicaid enrollment for PA with target 2783389k\n", - "Targeting Medicaid enrollment for RI with target 273400k\n", - "Targeting Medicaid enrollment for SC with target 932515k\n", - "Targeting Medicaid enrollment for SD with target 126952k\n", - "Targeting Medicaid enrollment for TN with target 1268904k\n", - "Targeting Medicaid enrollment for TX with target 3821806k\n", - "Targeting Medicaid enrollment for UT with target 300742k\n", - "Targeting Medicaid enrollment for VA with target 1596777k\n", - "Targeting Medicaid enrollment for VT with target 151833k\n", - "Targeting Medicaid enrollment for WA with target 1776116k\n", - "Targeting Medicaid enrollment for WI with target 1108320k\n", - "Targeting Medicaid enrollment for WV with target 467632k\n", - "Targeting Medicaid enrollment for WY with target 57320k\n" - ] - } - ], + "outputs": 
[], "source": [ "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", "\n", @@ -212,20 +28,18 @@ " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", " ]\n", "\n", - "approach = \"candidate_loss_contribution\" # for which you can specify the fraction\n", - "minimization_function = candidate_loss_contribution\n", - "# other minimization function approach is \"random_sampling_minimization\", for which you can specify the tolerance for loss relative change.\n", + "approach = \"random_sampling_minimization\" # for which you can specify the fraction\n", + "minimization_function = random_sampling_minimization\n", + "# other minimization function approach is \"candidate_loss_contribution\"\n", "\n", "for file in files:\n", " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", - " output_path.parent.mkdir(parents=True, exist_ok=True)\n", - "\n", " minimise_dataset(\n", " file,\n", " output_path,\n", + " loss_rel_change_max=10,\n", " minimization_function=minimization_function, \n", - " # target_fractions=[0.5] # remove if switching approach\n", - " loss_rel_change_max=0.0001, # remove if switching approach\n", + " target_fractions=[0.5] # remove if switching approach\n", " )" ] }, @@ -267,14 +81,6 @@ "\n", "data.save_dataset(output_path)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b4cf8e89", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From 94cacdeab007e318fe849bb3bbf4b29d7fcf627a Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Mon, 14 Jul 2025 15:22:58 +0200 Subject: [PATCH 38/56] few updates to the testing framework --- changelog_entry.yaml | 2 +- .../datasets/cps/enhanced_cps.py | 58 ++--- policyengine_us_data/utils/minimise.py | 59 +++++- pyproject.toml | 3 +- test_minimization_approach.ipynb | 198 +++++++++++++++++- 5 files changed, 280 insertions(+), 40 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 84eeb584..ac664753 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,4 +1,4 @@ - bump: minor changes: added: - - Enhanced CPS minimizing tests. \ No newline at end of file + - Enhanced CPS minimizing tests. 
\ No newline at end of file diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index ca53a84d..bf4b5501 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -40,7 +40,7 @@ def reweight( targets_array, dropout_rate=0.05, log_path="calibration_log.csv", - penalty_approach="l0_sigmoid", + penalty_approach=None, ): target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") @@ -76,35 +76,43 @@ def loss(weights, penalty_approach=penalty_approach): if torch.isnan(rel_error_normalized).any(): raise ValueError("Relative error contains NaNs") - # L0 penalty (approximated with smooth function) - # Since L0 is non-differentiable, we use a smooth approximation - # Common approaches: + if penalty_approach is not None: + # L0 penalty (approximated with smooth function) + # Since L0 is non-differentiable, we use a smooth approximation + # Common approaches: - epsilon = 1e-3 # Threshold for "near zero" - l0_penalty_weight = 1e-1 # Adjust this hyperparameter + epsilon = 1e-3 # Threshold for "near zero" + l0_penalty_weight = 1e-1 # Adjust this hyperparameter - # Option 1: Sigmoid approximation - if penalty_approach == "l0_sigmoid": - smoothed_l0 = torch.sigmoid( - (weights - epsilon) / (epsilon * 0.1) - ).mean() + # Option 1: Sigmoid approximation + if penalty_approach == "l0_sigmoid": + smoothed_l0 = torch.sigmoid( + (weights - epsilon) / (epsilon * 0.1) + ).mean() - # Option 2: Log-sum penalty (smoother) - if penalty_approach == "l0_log": - smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len(weights) + # Option 2: Log-sum penalty (smoother) + if penalty_approach == "l0_log": + smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len( + weights + ) - # Option 3: Exponential penalty - if penalty_approach == "l0_exp": - smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() + # Option 3: Exponential penalty + if penalty_approach == "l0_exp": + smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() - # L1 penalty - l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs + # L1 penalty + l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs - if penalty_approach == "l1": - l1 = torch.mean(weights) - return rel_error_normalized.mean() + l1_penalty_weight * l1 + if penalty_approach == "l1": + l1 = torch.mean(weights) + return rel_error_normalized.mean() + l1_penalty_weight * l1 - return rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 + return ( + rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 + ) + + else: + return rel_error_normalized.mean() def dropout_weights(weights, p): if p == 0: @@ -249,9 +257,9 @@ def generate(self): loss_matrix, targets_array = build_loss_matrix( self.input_dataset, year ) - zero_mask = np.isclose(targets_array, 0.0, atol=0.1) + bad_mask = loss_matrix.columns.isin(bad_targets) - keep_mask_bool = ~(zero_mask | bad_mask) + keep_mask_bool = ~bad_mask keep_idx = np.where(keep_mask_bool)[0] loss_matrix_clean = loss_matrix.iloc[:, keep_idx] targets_array_clean = targets_array[keep_idx] diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index da2cb7d1..9c3d59eb 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -5,14 +5,33 @@ import pandas as pd import h5py from policyengine_us_data.storage import STORAGE_FOLDER -from typing import Optional +from typing import Optional, Callable 
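The bad_targets list defined just below, like the one in PATCH 28, is used to drop known-problematic calibration targets from the loss matrix by column name before any reweighting or shrinking. A self-contained sketch of that masking step, using placeholder column names rather than real targets:

```python
import numpy as np
import pandas as pd

# Toy stand-in for the (households x targets) loss matrix and its target vector
loss_matrix = pd.DataFrame(
    np.ones((3, 3)),
    columns=[
        "nation/toy/good_target_a",
        "nation/toy/known_bad_target",
        "state/XX/toy/good_target_b",
    ],
)
targets = np.array([100.0, 5.0, 40.0])

bad_targets = ["nation/toy/known_bad_target"]

# Boolean mask over columns, then keep only the well-behaved targets
bad_mask = loss_matrix.columns.isin(bad_targets)
keep_idx = np.where(~bad_mask)[0]
loss_matrix_clean = loss_matrix.iloc[:, keep_idx]
targets_clean = targets[keep_idx]

# Columns and targets must stay aligned after filtering
assert loss_matrix_clean.shape[1] == targets_clean.size
```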
+bad_targets = [ + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", +] -def create_calibration_log_file(file_path): + +def create_calibration_log_file(file_path, epoch=0): dataset = Dataset.from_file(file_path) loss_matrix, targets = build_loss_matrix(dataset, 2024) + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_clean = targets[keep_idx] + assert loss_matrix_clean.shape[1] == targets_clean.size + loss_matrix, targets = build_loss_matrix(dataset, 2024) + bad_mask = loss_matrix.columns.isin(bad_targets) keep_mask_bool = ~bad_mask keep_idx = np.where(keep_mask_bool)[0] @@ -22,6 +41,10 @@ def create_calibration_log_file(file_path): sim = Microsimulation(dataset=dataset) + estimates = ( + sim.calculate("household_weight", 2024).values @ loss_matrix_clean + ) + target_names = loss_matrix_clean.columns estimates = ( sim.calculate("household_weight", 2024).values @ loss_matrix_clean ) @@ -32,9 +55,11 @@ def create_calibration_log_file(file_path): "target_name": target_names, "estimate": estimates, "target": targets_clean, + "target": targets_clean, } ) df["epoch"] = epoch + df["epoch"] = epoch df["error"] = df["estimate"] - df["target"] df["rel_error"] = df["error"] / df["target"] df["abs_error"] = df["error"].abs() @@ -43,6 +68,11 @@ def create_calibration_log_file(file_path): if df["target"].abs().sum() > 0 else np.nan ) + df["rel_abs_error"] = ( + df["abs_error"] / df["target"].abs() + if df["target"].abs().sum() > 0 + else np.nan + ) df["loss"] = (df["rel_error"] ** 2).mean() df.to_csv( @@ -258,6 +288,7 @@ def random_sampling_minimization( targets, normalisation_factor, random=True, + random=True, target_fractions=[0.1, 0.2, 0.3, 0.4, 0.5], ): """A simple random sampling approach""" @@ -265,6 +296,8 @@ def random_sampling_minimization( household_weights_normalized = weights / weights.sum() + household_weights_normalized = weights / weights.sum() + final_mask = None lowest_loss = float("inf") for fraction in target_fractions: @@ -283,6 +316,14 @@ def random_sampling_minimization( replace=False, ) ] = True + mask[ + np.random.choice( + n, + target_size, + p=household_weights_normalized if random else None, + replace=False, + ) + ] = True loss = get_loss_from_mask( weights, mask, estimate_matrix, targets, normalisation_factor @@ -322,6 +363,14 @@ def minimise_dataset( dataset = Dataset.from_file(dataset) loss_matrix, targets = build_loss_matrix(dataset, 2024) + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_clean = targets[keep_idx] + assert loss_matrix_clean.shape[1] == targets_clean.size + loss_matrix, targets = build_loss_matrix(dataset, 2024) + bad_mask = 
loss_matrix.columns.isin(bad_targets) keep_mask_bool = ~bad_mask keep_idx = np.where(keep_mask_bool)[0] @@ -333,6 +382,7 @@ def minimise_dataset( weights = sim.calculate("household_weight", 2024).values is_national = loss_matrix_clean.columns.str.startswith("nation/") + is_national = loss_matrix_clean.columns.str.startswith("nation/") nation_normalisation_factor = is_national * (1 / is_national.sum()) state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) normalisation_factor = np.where( @@ -344,8 +394,11 @@ def minimise_dataset( weights=weights, estimate_matrix=loss_matrix_clean, targets=targets_clean, + estimate_matrix=loss_matrix_clean, + targets=targets_clean, normalisation_factor=normalisation_factor, **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. + **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. ) # Extract household IDs for remaining households @@ -386,7 +439,7 @@ def minimise_dataset( f.create_dataset(f"{variable}/{year}", data=value) print(f"Saved minimised dataset to {output_path}") - create_calibration_log_file(output_path) + create_calibration_log_file(output_path, epoch=500) if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml index 5a75693f..7f3e59b5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,9 +17,8 @@ authors = [ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ - "policyengine-us>=1.340.1", + "policyengine-us>=1.340.0", "policyengine-core>=3.17.1", - "pandas>=2.3.0", "requests", "tqdm", "microdf_python>=0.4.3", diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 519d2725..5a7a9d15 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "d6dc9cca", "metadata": {}, "outputs": [], @@ -12,15 +12,188 @@ "from policyengine_us import Microsimulation\n", "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", "from policyengine_us_data.utils import build_loss_matrix\n", - "import numpy as np" + "import numpy as np\n", + "import os\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "db975ac1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + "Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid enrollment for IN with target 1623361k\n", + 
"Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with target 1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + "Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n", + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + "Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid enrollment for IN with target 1623361k\n", + "Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with target 1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + 
"Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n", + "Iteration 1: Loss changed from 4.565926440883813e-08 to 5.336074793134824e-08\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 2: Loss changed from 5.336074793134824e-08 to 6.613975524012925e-08\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 3: Loss changed from 6.613975524012925e-08 to 8.772356346466888e-08\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 4: Loss changed from 8.772356346466888e-08 to 1.1406333044735764e-07\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 5: Loss changed from 1.1406333044735764e-07 to 1.3666972105275468e-07\n", + "Removed 4131 households with worst relative loss changes.\n", + "Weight relative change: 52.19%\n", + "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + 
"Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid enrollment for IN with target 1623361k\n", + "Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with target 1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + "Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n" + ] + } + ], "source": [ "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", "\n", @@ -28,18 +201,17 @@ " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", " ]\n", "\n", - "approach = \"random_sampling_minimization\" # for which you can specify the fraction\n", - "minimization_function = random_sampling_minimization\n", - "# other minimization function approach is \"candidate_loss_contribution\"\n", + "approach = \"candidate_loss_contribution\" # for which you can specify the fraction\n", + "minimization_function = candidate_loss_contribution\n", + "# other minimization function approach is \"candidate_loss_contribution\", for which you can specify the tolerance for loss relative change.\n", "\n", "for file in files:\n", " output_path = STORAGE_FOLDER / approach / 
\"enhanced_cps_2024_minimised.h5\"\n", " minimise_dataset(\n", " file,\n", " output_path,\n", - " loss_rel_change_max=10,\n", " minimization_function=minimization_function, \n", - " target_fractions=[0.5] # remove if switching approach\n", + " #target_fractions=[0.5] # remove if switching approach\n", " )" ] }, @@ -81,6 +253,14 @@ "\n", "data.save_dataset(output_path)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4cf8e89", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From a71530b7b6f2723cfbf54a64f8f28f9d77e6da1d Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Mon, 14 Jul 2025 19:56:54 +0200 Subject: [PATCH 39/56] fix calibration for each approach --- .../datasets/cps/enhanced_cps.py | 1 + policyengine_us_data/utils/loss.py | 5 -- policyengine_us_data/utils/minimise.py | 89 ++++++++++++++----- test_minimization_approach.ipynb | 86 ++++++++++++------ 4 files changed, 129 insertions(+), 52 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index bf4b5501..33f62929 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -39,6 +39,7 @@ def reweight( loss_matrix, targets_array, dropout_rate=0.05, + epochs=500, log_path="calibration_log.csv", penalty_approach=None, ): diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index 21abce0f..fbdbacef 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -552,11 +552,6 @@ def build_loss_matrix(dataset: type, time_period): # Convert to thousands for the target targets_array.append(row["enrollment"]) - print( - f"Targeting Medicaid enrollment for {row['state']} " - f"with target {row['enrollment']:.0f}k" - ) - # State 10-year age targets age_targets = pd.read_csv(STORAGE_FOLDER / "age_state.csv") diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 9c3d59eb..84c55d31 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -6,6 +6,7 @@ import h5py from policyengine_us_data.storage import STORAGE_FOLDER from typing import Optional, Callable +from policyengine_us_data.datasets.cps.enhanced_cps import reweight bad_targets = [ "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", @@ -20,35 +21,54 @@ def create_calibration_log_file(file_path, epoch=0): + print(f"=== CALIBRATION LOG DEBUG ===") + print(f"File path: {file_path}") + print(f"Epoch: {epoch}") + dataset = Dataset.from_file(file_path) + sim = Microsimulation(dataset=dataset) - loss_matrix, targets = build_loss_matrix(dataset, 2024) + # Debug: Print dataset info + household_weights = sim.calculate("household_weight", 2024) + print(f"Number of households: {len(household_weights)}") + print(f"Total weight: {household_weights.sum():.2f}") + print( + f"Weight range: {household_weights.min():.2f} to {household_weights.max():.2f}" + ) - bad_mask = loss_matrix.columns.isin(bad_targets) - keep_mask_bool = ~bad_mask - keep_idx = np.where(keep_mask_bool)[0] - loss_matrix_clean = loss_matrix.iloc[:, keep_idx] - targets_clean = targets[keep_idx] - assert loss_matrix_clean.shape[1] == targets_clean.size loss_matrix, targets = build_loss_matrix(dataset, 2024) + print(f"Loss matrix shape: {loss_matrix.shape}") + print(f"Number of targets: {len(targets)}") bad_mask = loss_matrix.columns.isin(bad_targets) keep_mask_bool = ~bad_mask keep_idx = 
np.where(keep_mask_bool)[0] loss_matrix_clean = loss_matrix.iloc[:, keep_idx] targets_clean = targets[keep_idx] - assert loss_matrix_clean.shape[1] == targets_clean.size - sim = Microsimulation(dataset=dataset) + print(f"After filtering bad targets:") + print(f"Loss matrix clean shape: {loss_matrix_clean.shape}") + print(f"Number of clean targets: {len(targets_clean)}") + + assert loss_matrix_clean.shape[1] == targets_clean.size estimates = ( sim.calculate("household_weight", 2024).values @ loss_matrix_clean ) target_names = loss_matrix_clean.columns - estimates = ( - sim.calculate("household_weight", 2024).values @ loss_matrix_clean - ) - target_names = loss_matrix_clean.columns + + # Debug: Print estimate statistics + print(f"Estimates shape: {estimates.shape}") + print(f"Estimates sum: {estimates.sum():.2f}") + print(f"First 3 estimates: {estimates[:3]}") + print(f"First 3 targets: {targets_clean[:3]}") + + # Calculate and print some key metrics + errors = estimates - targets_clean + rel_errors = errors / targets_clean + print(f"Mean absolute error: {np.abs(errors).mean():.2f}") + print(f"Mean relative error: {np.abs(rel_errors).mean():.4f}") + print(f"=== END DEBUG ===\n") df = pd.DataFrame( { @@ -158,6 +178,7 @@ def get_loss_from_mask( """ Calculate the loss based on the inclusion mask and the estimate matrix. """ + # Step 1: Apply mask and rescale weights masked_weights = weights.copy() original_weight_total = masked_weights.sum() if (~inclusion_mask).sum() > 0: @@ -166,7 +187,26 @@ def get_loss_from_mask( masked_weights[inclusion_mask] *= ( original_weight_total / masked_weight_total ) - estimates = masked_weights @ estimate_matrix + + # Step 2: Re-calibrate the masked weights to hit targets + # Only calibrate the included households + included_weights = masked_weights[inclusion_mask] + included_estimate_matrix = estimate_matrix[inclusion_mask] + + # Call reweight function to calibrate the selected households + calibrated_weights_included = reweight( + included_weights, + included_estimate_matrix, + targets, + epochs=250, + ) + + # Put calibrated weights back into full array + calibrated_weights = np.zeros_like(masked_weights) + calibrated_weights[inclusion_mask] = calibrated_weights_included + + # Calculate estimates and loss from calibrated weights + estimates = calibrated_weights @ estimate_matrix rel_error = ((estimates - targets) + 1) / (targets + 1) loss = ((rel_error * normalisation_factor) ** 2).mean() @@ -288,8 +328,7 @@ def random_sampling_minimization( targets, normalisation_factor, random=True, - random=True, - target_fractions=[0.1, 0.2, 0.3, 0.4, 0.5], + target_fractions=[0.5, 0.6, 0.7, 0.8, 0.9], ): """A simple random sampling approach""" n = len(weights) @@ -306,7 +345,7 @@ def random_sampling_minimization( best_mask = None best_loss = float("inf") - for _ in range(5): # Try 5 random samples + for _ in range(3): # Try 3 random samples mask = np.zeros(n, dtype=bool) mask[ np.random.choice( @@ -419,12 +458,20 @@ def minimise_dataset( sim = Microsimulation(dataset=smaller_df) # Rescale weights to maintain total - sim.set_input( - "household_weight", - 2024, - sim.calculate("household_weight", 2024).values / weight_rel_change, + initial_weights = ( + sim.calculate("household_weight", 2024).values / weight_rel_change ) + # Re-calibrate the final selected households to hit targets + print("Re-calibrating final selected households...") + calibrated_weights = reweight( + initial_weights, + loss_matrix_clean.values, # Convert to numpy array + targets_clean, + epochs=250, # 
Reduced epochs for faster processing + ) + sim.set_input("household_weight", 2024, calibrated_weights) + print("Final calibration completed successfully") # Prepare data for saving data = {} for variable in sim.input_variables: diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 5a7a9d15..6683da0c 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 6, + "execution_count": 1, "id": "d6dc9cca", "metadata": {}, "outputs": [], @@ -13,12 +13,27 @@ "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", "from policyengine_us_data.utils import build_loss_matrix\n", "import numpy as np\n", - "import os\n" + "import os" ] }, { "cell_type": "code", +<<<<<<< HEAD "execution_count": 7, +======= + "execution_count": null, + "id": "6daabe7c", + "metadata": {}, + "outputs": [], + "source": [ + "# Original ECPS 2024 dataset size (for household entity): 41310\n", + "# After minimization through \"candidate_loss_contribution\" and a 1.0 max error change (for household entity): 20655 \n", + "# After minimization through \"candidate_loss_contribution\" and a 0.001 max error change (for household entity): 24786\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "id": "db975ac1", "metadata": {}, "outputs": [ @@ -128,18 +143,17 @@ "Targeting Medicaid enrollment for WI with target 1108320k\n", "Targeting Medicaid enrollment for WV with target 467632k\n", "Targeting Medicaid enrollment for WY with target 57320k\n", - "Iteration 1: Loss changed from 4.565926440883813e-08 to 5.336074793134824e-08\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 2: Loss changed from 5.336074793134824e-08 to 6.613975524012925e-08\n", + "Iteration 1: Loss changed from 3.2762747622384236e-07 to 3.269240481699973e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Iteration 3: Loss changed from 6.613975524012925e-08 to 8.772356346466888e-08\n", + "Iteration 2: Loss changed from 3.269240481699973e-07 to 3.260236288186747e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Iteration 4: Loss changed from 8.772356346466888e-08 to 1.1406333044735764e-07\n", + "Iteration 3: Loss changed from 3.260236288186747e-07 to 3.258031076364204e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Iteration 5: Loss changed from 1.1406333044735764e-07 to 1.3666972105275468e-07\n", + "Iteration 4: Loss changed from 3.258031076364204e-07 to 3.2572688458943833e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Weight relative change: 52.19%\n", - "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", + "Iteration 5: Loss changed from 3.2572688458943833e-07 to 3.2598496400284314e-07, which is too high (0.08%). 
Stopping.\n", + "Weight relative change: 99.10%\n", + "Saved minimised dataset to /Users/movil1/Desktop/PYTHONJOBS/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", "Targeting Medicaid enrollment for AK with target 231577k\n", "Targeting Medicaid enrollment for AL with target 766009k\n", "Targeting Medicaid enrollment for AR with target 733561k\n", @@ -203,32 +217,38 @@ "\n", "approach = \"candidate_loss_contribution\" # for which you can specify the fraction\n", "minimization_function = candidate_loss_contribution\n", - "# other minimization function approach is \"candidate_loss_contribution\", for which you can specify the tolerance for loss relative change.\n", + "# other minimization function approach is \"random_sampling_minimization\", for which you can specify the tolerance for loss relative change.\n", "\n", "for file in files:\n", " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + "\n", " minimise_dataset(\n", " file,\n", " output_path,\n", " minimization_function=minimization_function, \n", - " #target_fractions=[0.5] # remove if switching approach\n", + " # target_fractions=[0.5] # remove if switching approach\n", + " loss_rel_change_max=0.0001, # remove if switching approach\n", " )" ] }, { "cell_type": "code", - "execution_count": null, - "id": "35892c9d", + "execution_count": 4, + "id": "b4cf8e89", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250/250 [01:24<00:00, 2.98it/s, loss=3.37e-5, loss_rel_change=-0.92] \n" + ] + } + ], "source": [ - "## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation\n", - "\n", "input_dataset = ExtendedCPS_2024\n", "\n", - "approach = \"l0_sigmoid\"\n", - "# other options are \"l0_log\", \"l0_exp\", \"l1\"\n", - "\n", "sim = Microsimulation(dataset=input_dataset)\n", "data = sim.dataset.load_dataset()\n", "data[\"household_weight\"] = {}\n", @@ -240,18 +260,32 @@ " loss_matrix, targets_array = build_loss_matrix(\n", " input_dataset, year\n", " )\n", + "\n", + " bad_mask = loss_matrix.columns.isin(bad_targets)\n", + " keep_mask_bool = ~bad_mask\n", + " keep_idx = np.where(keep_mask_bool)[0]\n", + " loss_matrix_clean = loss_matrix.iloc[:, keep_idx]\n", + " targets_array_clean = targets_array[keep_idx]\n", + " assert loss_matrix_clean.shape[1] == targets_array_clean.size\n", + " assert loss_matrix_clean.shape[1] != loss_matrix.shape[1]\n", + "\n", " optimised_weights = reweight(\n", " original_weights,\n", - " loss_matrix,\n", - " targets_array,\n", - " log_path= STORAGE_FOLDER / approach / \"calibration_log.csv\",\n", - " penalty_approach=approach,\n", + " loss_matrix_clean,\n", + " targets_array_clean,\n", + " log_path=\"baseline_calibration_log.csv\",\n", + " epochs=250, # Reduced epochs for faster processing\n", " )\n", " data[\"household_weight\"][year] = optimised_weights\n", "\n", - "output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + "output_path = STORAGE_FOLDER / \"baseline\" / \"enhanced_cps_2024_baseline.h5\"\n", + "output_path.parent.mkdir(parents=True, exist_ok=True)\n", "\n", - "data.save_dataset(output_path)" + "# Save to HDF5 file\n", + "with h5py.File(output_path, \"w\") as f:\n", + " for variable, values in data.items():\n", + " for year, value in values.items():\n", + " 
f.create_dataset(f\"{variable}/{year}\", data=value)" ] }, { From f146620a9c71761336d7b1c49ae5e54b09f100e4 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Mon, 14 Jul 2025 20:19:38 +0200 Subject: [PATCH 40/56] fixed testing framework --- policyengine_us_data/utils/minimise.py | 39 +-- test_minimization_approach.ipynb | 330 ++++++++++--------------- 2 files changed, 134 insertions(+), 235 deletions(-) diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 84c55d31..b3e0ed1a 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -75,11 +75,9 @@ def create_calibration_log_file(file_path, epoch=0): "target_name": target_names, "estimate": estimates, "target": targets_clean, - "target": targets_clean, } ) df["epoch"] = epoch - df["epoch"] = epoch df["error"] = df["estimate"] - df["target"] df["rel_error"] = df["error"] / df["target"] df["abs_error"] = df["error"].abs() @@ -88,11 +86,6 @@ def create_calibration_log_file(file_path, epoch=0): if df["target"].abs().sum() > 0 else np.nan ) - df["rel_abs_error"] = ( - df["abs_error"] / df["target"].abs() - if df["target"].abs().sum() > 0 - else np.nan - ) df["loss"] = (df["rel_error"] ** 2).mean() df.to_csv( @@ -172,6 +165,7 @@ def minimise_dataset( ) weights @ estimate_matrix + def get_loss_from_mask( weights, inclusion_mask, estimate_matrix, targets, normalisation_factor ): @@ -264,15 +258,6 @@ def candidate_loss_contribution( size=int(full_mask.sum() * view_fraction_per_iteration), replace=False, ) - - # more efficient approach to compute losses for candidate households to be removed - - # 1. sample only households that are currently *included* - indices = np.random.choice( - np.where(full_mask)[0], - size=int(full_mask.sum() * VIEW_FRACTION_PER_ITERATION), - replace=False, - ) # 2. 
compute losses for the batch in one shot candidate_losses = losses_for_candidates( weights, indices, estimate_matrix, targets, normalisation_factor @@ -335,8 +320,6 @@ def random_sampling_minimization( household_weights_normalized = weights / weights.sum() - household_weights_normalized = weights / weights.sum() - final_mask = None lowest_loss = float("inf") for fraction in target_fractions: @@ -355,14 +338,6 @@ def random_sampling_minimization( replace=False, ) ] = True - mask[ - np.random.choice( - n, - target_size, - p=household_weights_normalized if random else None, - replace=False, - ) - ] = True loss = get_loss_from_mask( weights, mask, estimate_matrix, targets, normalisation_factor @@ -402,14 +377,6 @@ def minimise_dataset( dataset = Dataset.from_file(dataset) loss_matrix, targets = build_loss_matrix(dataset, 2024) - bad_mask = loss_matrix.columns.isin(bad_targets) - keep_mask_bool = ~bad_mask - keep_idx = np.where(keep_mask_bool)[0] - loss_matrix_clean = loss_matrix.iloc[:, keep_idx] - targets_clean = targets[keep_idx] - assert loss_matrix_clean.shape[1] == targets_clean.size - loss_matrix, targets = build_loss_matrix(dataset, 2024) - bad_mask = loss_matrix.columns.isin(bad_targets) keep_mask_bool = ~bad_mask keep_idx = np.where(keep_mask_bool)[0] @@ -421,7 +388,6 @@ def minimise_dataset( weights = sim.calculate("household_weight", 2024).values is_national = loss_matrix_clean.columns.str.startswith("nation/") - is_national = loss_matrix_clean.columns.str.startswith("nation/") nation_normalisation_factor = is_national * (1 / is_national.sum()) state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) normalisation_factor = np.where( @@ -433,11 +399,8 @@ def minimise_dataset( weights=weights, estimate_matrix=loss_matrix_clean, targets=targets_clean, - estimate_matrix=loss_matrix_clean, - targets=targets_clean, normalisation_factor=normalisation_factor, **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. - **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. 
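        # Which keyword ends up in **kwargs depends on the chosen minimization_function:
        # candidate_loss_contribution is tuned with loss_rel_change_max, while
        # random_sampling_minimization is tuned with target_fractions (see the calls
        # in test_minimization_approach.ipynb).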
) # Extract household IDs for remaining households diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 6683da0c..7c416e2a 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "d6dc9cca", "metadata": {}, "outputs": [], @@ -13,228 +13,172 @@ "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", "from policyengine_us_data.utils import build_loss_matrix\n", "import numpy as np\n", - "import os" + "import os\n", + "import h5py\n", + "\n", + "bad_targets = [\n", + " \"nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household\",\n", + " \"nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household\",\n", + " \"nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse\",\n", + " \"nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse\",\n", + " \"nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household\",\n", + " \"nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household\",\n", + " \"nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse\",\n", + " \"nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse\",\n", + "]" ] }, { "cell_type": "code", -<<<<<<< HEAD - "execution_count": 7, -======= "execution_count": null, - "id": "6daabe7c", + "id": "683fd57e", "metadata": {}, "outputs": [], "source": [ - "# Original ECPS 2024 dataset size (for household entity): 41310\n", - "# After minimization through \"candidate_loss_contribution\" and a 1.0 max error change (for household entity): 20655 \n", - "# After minimization through \"candidate_loss_contribution\" and a 0.001 max error change (for household entity): 24786\n" + "# Length of household entity in the dataset measured through household_weight:\n", + "\n", + "# Original ECPS 2024 dataset size: 41310\n", + "# Through \"random_sampling_minimization\" with 0.5 of the dataset being pruned: 20655\n", + "# Through \"random_sampling_minimization\" with 0.2 of the dataset being pruned: 33408\n", + "# After minimization through \"candidate_loss_contribution\" and a 1.0 max error change: 20655 \n", + "# After minimization through \"candidate_loss_contribution\" and a 0.001 max error change: 24786" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "db975ac1", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Targeting Medicaid enrollment for AK with target 231577k\n", - "Targeting Medicaid enrollment for AL with target 766009k\n", - "Targeting Medicaid enrollment for AR with target 733561k\n", - "Targeting Medicaid enrollment for AZ with target 1778734k\n", - "Targeting Medicaid enrollment for CA with target 12172695k\n", - "Targeting Medicaid enrollment for CO with target 1058326k\n", - "Targeting Medicaid enrollment for CT with target 904321k\n", - "Targeting Medicaid enrollment for DC with target 240020k\n", - "Targeting Medicaid enrollment for DE with target 236840k\n", - "Targeting Medicaid enrollment for FL with target 3568648k\n", - "Targeting Medicaid enrollment for GA with target 1699279k\n", - "Targeting Medicaid enrollment for HI with target 376318k\n", - "Targeting Medicaid enrollment for IA with target 586748k\n", - "Targeting Medicaid enrollment for ID 
with target 296968k\n",
    [... further "Targeting Medicaid enrollment for <state> with target ...k" output lines, a verbatim repeat of the state-by-state list shown earlier in this output ...]
    -    "Targeting Medicaid enrollment for KY with 
target 1244822k\n", - "Targeting Medicaid enrollment for LA with target 1377806k\n", - "Targeting Medicaid enrollment for MA with target 1453344k\n", - "Targeting Medicaid enrollment for MD with target 1280697k\n", - "Targeting Medicaid enrollment for ME with target 322306k\n", - "Targeting Medicaid enrollment for MI with target 2194067k\n", - "Targeting Medicaid enrollment for MN with target 1146667k\n", - "Targeting Medicaid enrollment for MO with target 1118780k\n", - "Targeting Medicaid enrollment for MS with target 514730k\n", - "Targeting Medicaid enrollment for MT with target 193278k\n", - "Targeting Medicaid enrollment for NC with target 2469712k\n", - "Targeting Medicaid enrollment for ND with target 100543k\n", - "Targeting Medicaid enrollment for NE with target 302971k\n", - "Targeting Medicaid enrollment for NH with target 166813k\n", - "Targeting Medicaid enrollment for NJ with target 1506239k\n", - "Targeting Medicaid enrollment for NM with target 686825k\n", - "Targeting Medicaid enrollment for NV with target 713936k\n", - "Targeting Medicaid enrollment for NY with target 5946806k\n", - "Targeting Medicaid enrollment for OH with target 2596879k\n", - "Targeting Medicaid enrollment for OK with target 894911k\n", - "Targeting Medicaid enrollment for OR with target 1123313k\n", - "Targeting Medicaid enrollment for PA with target 2783389k\n", - "Targeting Medicaid enrollment for RI with target 273400k\n", - "Targeting Medicaid enrollment for SC with target 932515k\n", - "Targeting Medicaid enrollment for SD with target 126952k\n", - "Targeting Medicaid enrollment for TN with target 1268904k\n", - "Targeting Medicaid enrollment for TX with target 3821806k\n", - "Targeting Medicaid enrollment for UT with target 300742k\n", - "Targeting Medicaid enrollment for VA with target 1596777k\n", - "Targeting Medicaid enrollment for VT with target 151833k\n", - "Targeting Medicaid enrollment for WA with target 1776116k\n", - "Targeting Medicaid enrollment for WI with target 1108320k\n", - "Targeting Medicaid enrollment for WV with target 467632k\n", - "Targeting Medicaid enrollment for WY with target 57320k\n", - "Iteration 1: Loss changed from 3.2762747622384236e-07 to 3.269240481699973e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 2: Loss changed from 3.269240481699973e-07 to 3.260236288186747e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 3: Loss changed from 3.260236288186747e-07 to 3.258031076364204e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 4: Loss changed from 3.258031076364204e-07 to 3.2572688458943833e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 5: Loss changed from 3.2572688458943833e-07 to 3.2598496400284314e-07, which is too high (0.08%). 
Stopping.\n", - "Weight relative change: 99.10%\n", - "Saved minimised dataset to /Users/movil1/Desktop/PYTHONJOBS/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", - "Targeting Medicaid enrollment for AK with target 231577k\n", - "Targeting Medicaid enrollment for AL with target 766009k\n", - "Targeting Medicaid enrollment for AR with target 733561k\n", - "Targeting Medicaid enrollment for AZ with target 1778734k\n", - "Targeting Medicaid enrollment for CA with target 12172695k\n", - "Targeting Medicaid enrollment for CO with target 1058326k\n", - "Targeting Medicaid enrollment for CT with target 904321k\n", - "Targeting Medicaid enrollment for DC with target 240020k\n", - "Targeting Medicaid enrollment for DE with target 236840k\n", - "Targeting Medicaid enrollment for FL with target 3568648k\n", - "Targeting Medicaid enrollment for GA with target 1699279k\n", - "Targeting Medicaid enrollment for HI with target 376318k\n", - "Targeting Medicaid enrollment for IA with target 586748k\n", - "Targeting Medicaid enrollment for ID with target 296968k\n", - "Targeting Medicaid enrollment for IL with target 2918179k\n", - "Targeting Medicaid enrollment for IN with target 1623361k\n", - "Targeting Medicaid enrollment for KS with target 335902k\n", - "Targeting Medicaid enrollment for KY with target 1244822k\n", - "Targeting Medicaid enrollment for LA with target 1377806k\n", - "Targeting Medicaid enrollment for MA with target 1453344k\n", - "Targeting Medicaid enrollment for MD with target 1280697k\n", - "Targeting Medicaid enrollment for ME with target 322306k\n", - "Targeting Medicaid enrollment for MI with target 2194067k\n", - "Targeting Medicaid enrollment for MN with target 1146667k\n", - "Targeting Medicaid enrollment for MO with target 1118780k\n", - "Targeting Medicaid enrollment for MS with target 514730k\n", - "Targeting Medicaid enrollment for MT with target 193278k\n", - "Targeting Medicaid enrollment for NC with target 2469712k\n", - "Targeting Medicaid enrollment for ND with target 100543k\n", - "Targeting Medicaid enrollment for NE with target 302971k\n", - "Targeting Medicaid enrollment for NH with target 166813k\n", - "Targeting Medicaid enrollment for NJ with target 1506239k\n", - "Targeting Medicaid enrollment for NM with target 686825k\n", - "Targeting Medicaid enrollment for NV with target 713936k\n", - "Targeting Medicaid enrollment for NY with target 5946806k\n", - "Targeting Medicaid enrollment for OH with target 2596879k\n", - "Targeting Medicaid enrollment for OK with target 894911k\n", - "Targeting Medicaid enrollment for OR with target 1123313k\n", - "Targeting Medicaid enrollment for PA with target 2783389k\n", - "Targeting Medicaid enrollment for RI with target 273400k\n", - "Targeting Medicaid enrollment for SC with target 932515k\n", - "Targeting Medicaid enrollment for SD with target 126952k\n", - "Targeting Medicaid enrollment for TN with target 1268904k\n", - "Targeting Medicaid enrollment for TX with target 3821806k\n", - "Targeting Medicaid enrollment for UT with target 300742k\n", - "Targeting Medicaid enrollment for VA with target 1596777k\n", - "Targeting Medicaid enrollment for VT with target 151833k\n", - "Targeting Medicaid enrollment for WA with target 1776116k\n", - "Targeting Medicaid enrollment for WI with target 1108320k\n", - "Targeting Medicaid enrollment for WV with target 467632k\n", - "Targeting Medicaid enrollment for WY with target 57320k\n" - ] - } - ], + "outputs": 
[], "source": [ + "## ALL TESTS\n", + "\n", + "## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation\n", + "\n", + "input_dataset = ExtendedCPS_2024\n", + "\n", + "approaches = [\"l0_sigmoid\", \"l0_log\", \"l0_exp\", \"l1\"]\n", + "\n", + "for approach in approaches:\n", + " sim = Microsimulation(dataset=input_dataset)\n", + " data = sim.dataset.load_dataset()\n", + " data[\"household_weight\"] = {}\n", + " original_weights = sim.calculate(\"household_weight\")\n", + " original_weights = original_weights.values + np.random.normal(\n", + " 1, 0.1, len(original_weights)\n", + " )\n", + " for year in range(2024, 2025):\n", + " loss_matrix, targets_array = build_loss_matrix(\n", + " input_dataset, year\n", + " )\n", + "\n", + " bad_mask = loss_matrix.columns.isin(bad_targets)\n", + " keep_mask_bool = ~bad_mask\n", + " keep_idx = np.where(keep_mask_bool)[0]\n", + " loss_matrix_clean = loss_matrix.iloc[:, keep_idx]\n", + " targets_array_clean = targets_array[keep_idx]\n", + " assert loss_matrix_clean.shape[1] == targets_array_clean.size\n", + "\n", + " optimised_weights = reweight(\n", + " original_weights,\n", + " loss_matrix_clean,\n", + " targets_array_clean,\n", + " log_path=\"calibration_log.csv\",\n", + " penalty_approach=approach,\n", + " epochs=250, # Reduced epochs for faster processing\n", + " )\n", + " data[\"household_weight\"][year] = optimised_weights\n", + "\n", + " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + "\n", + " # Save to HDF5 file\n", + " with h5py.File(output_path, \"w\") as f:\n", + " for variable, values in data.items():\n", + " for year, value in values.items():\n", + " f.create_dataset(f\"{variable}/{year}\", data=value)\n", + "\n", + "\n", "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", "\n", "files = [\n", " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", " ]\n", "\n", - "approach = \"candidate_loss_contribution\" # for which you can specify the fraction\n", - "minimization_function = candidate_loss_contribution\n", + "approaches = {\n", + " \"random_sampling_minimization\": random_sampling_minimization,\n", + " \"candidate_loss_contribution\": candidate_loss_contribution,\n", + "}\n", + "\n", + "optional_params = {\n", + " \"random_sampling_minimization\": {\n", + " \"target_fractions\": [0.5, 0.6, 0.7, 0.8, 0.9], # fractions of the dataset to keep\n", + " },\n", + " \"candidate_loss_contribution\": {\n", + " \"loss_rel_change_max\": [0.00001, 0.000001, 0.0000001] # maximum relative change in\n", + " }\n", + "}\n", + "\n", + "for approach, function in approaches.items():\n", + " minimization_function = function\n", + " # other minimization function approach is \"random_sampling_minimization\", for which you can specify the tolerance for loss relative change.\n", + "\n", + " for params, values in optional_params[approach].items():\n", + " for value in values:\n", + " if params == \"target_fractions\":\n", + " for file in files:\n", + " output_path = STORAGE_FOLDER / approach / f\"{value}_enhanced_cps_2024_minimised.h5\"\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + " minimise_dataset(\n", + " file,\n", + " output_path,\n", + " minimization_function=minimization_function, \n", + " target_fractions=[value]\n", + " )\n", + " elif params == \"loss_rel_change_max\":\n", + " for file in files:\n", + " output_path = STORAGE_FOLDER / approach / 
f\"{value}_enhanced_cps_2024_minimised.h5\"\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + " minimise_dataset(\n", + " file,\n", + " output_path,\n", + " minimization_function=minimization_function, \n", + " loss_rel_change_max=value\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35892c9d", + "metadata": {}, + "outputs": [], + "source": [ + "## SMALL CHECKS BELOW -- IGNORE ---\n", + "\n", + "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", + "\n", + "files = [\n", + " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", + " ]\n", + "\n", + "minimization_function = random_sampling_minimization\n", "# other minimization function approach is \"random_sampling_minimization\", for which you can specify the tolerance for loss relative change.\n", "\n", "for file in files:\n", - " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + " output_path = STORAGE_FOLDER / \"random_sampling_minimization\" / f\"{1.0}_enhanced_cps_2024_minimised.h5\"\n", " output_path.parent.mkdir(parents=True, exist_ok=True)\n", - "\n", " minimise_dataset(\n", " file,\n", " output_path,\n", " minimization_function=minimization_function, \n", - " # target_fractions=[0.5] # remove if switching approach\n", - " loss_rel_change_max=0.0001, # remove if switching approach\n", + " target_fractions=[1.0]\n", " )" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "b4cf8e89", "metadata": {}, "outputs": [ @@ -287,14 +231,6 @@ " for year, value in values.items():\n", " f.create_dataset(f\"{variable}/{year}\", data=value)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b4cf8e89", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From 51d9c9c90c632db53fb7b9c0e5fe24f319859b17 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 11:09:12 +0200 Subject: [PATCH 41/56] attempt generating Minimized ECPS --- changelog_entry.yaml | 2 +- .../datasets/cps/enhanced_cps.py | 74 +++-- .../tests/test_datasets/test_enhanced_cps.py | 59 ++++ policyengine_us_data/utils/minimise.py | 89 ++---- test_minimization_approach.ipynb | 257 ------------------ 5 files changed, 140 insertions(+), 341 deletions(-) delete mode 100644 test_minimization_approach.ipynb diff --git a/changelog_entry.yaml b/changelog_entry.yaml index ac664753..725035b9 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,4 +1,4 @@ - bump: minor changes: added: - - Enhanced CPS minimizing tests. \ No newline at end of file + - Minimized Enhanced CPS. 
\ No newline at end of file diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 83fe6b99..82aa9f27 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -9,6 +9,10 @@ import numpy as np from typing import Type from policyengine_us_data.storage import STORAGE_FOLDER +from policyengine_us_data.utils.minimise import ( + candidate_loss_contribution, + minimize_dataset, +) from policyengine_us_data.datasets.cps.extended_cps import ( ExtendedCPS_2024, CPS_2019, @@ -231,28 +235,6 @@ def generate(self): 1, 0.1, len(original_weights) ) - bad_targets = [ - "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", - "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", - "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", - "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", - "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", - "state/RI/adjusted_gross_income/amount/-inf_1", - "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", - "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", - "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", - "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", - "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", - "state/RI/adjusted_gross_income/amount/-inf_1", - "nation/irs/exempt interest/count/AGI in -inf-inf/taxable/All", - ] - # Run the optimization procedure to get (close to) minimum loss weights for year in range(self.start_year, self.end_year + 1): loss_matrix, targets_array = build_loss_matrix( @@ -327,6 +309,53 @@ def generate(self): self.save_dataset(data) +class MinimizedEnhancedCPS_2024(Dataset): + input_dataset = ExtendedCPS_2024 + start_year = 2024 + name = "minimized_enhanced_cps_2024" + label = "Minimized Enhanced CPS 2024" + file_path = STORAGE_FOLDER / "minimized_enhanced_cps_2024.h5" + url = ( + "hf://policyengine/policyengine-us-data/minimized_enhanced_cps_2024.h5" + ) + + def generate(self): + from policyengine_us import Microsimulation + + sim = Microsimulation(dataset=self.input_dataset) + data = sim.dataset.load_dataset() + data["household_weight"] = {} + original_weights = sim.calculate("household_weight") + original_weights = original_weights.values + np.random.normal( + 1, 0.1, len(original_weights) + ) + + # Run the optimization procedure to get (close to) minimum loss weights + for year in range(self.start_year, self.end_year + 1): + loss_matrix, targets_array = build_loss_matrix( + self.input_dataset, year + ) + + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = 
loss_matrix.iloc[:, keep_idx] + targets_array_clean = targets_array[keep_idx] + assert loss_matrix_clean.shape[1] == targets_array_clean.size + + minimize_dataset( + self.input_dataset, + self.file_path, + minimization_function=candidate_loss_contribution, + loss_matrix=loss_matrix_clean, + targets=targets_array_clean, + target_fractions=[0.1], # maximum relative change in loss + count_iterations=5, + view_fraction_per_iteration=0.5, + fraction_remove_per_iteration=0.1, + ) + + class EnhancedCPS_2024(EnhancedCPS): input_dataset = ExtendedCPS_2024 start_year = 2024 @@ -339,3 +368,4 @@ class EnhancedCPS_2024(EnhancedCPS): if __name__ == "__main__": EnhancedCPS_2024().generate() + MinimizedEnhancedCPS_2024().generate() diff --git a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py index abf67301..c6660f66 100644 --- a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py +++ b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py @@ -1,4 +1,5 @@ import pytest +import pandas as pd def test_ecps_has_mortgage_interest(): @@ -254,3 +255,61 @@ def test_medicaid_calibration(): assert ( not failed ), f"One or more states exceeded tolerance of {TOLERANCE:.0%}." + + +def test_minimized_enhanced_cps_calibration_quality(): + """ + Test that minimized Enhanced CPS datasets maintain calibration quality above 75%. + Quality score formula: ((excellentCount * 100 + goodCount * 75) / totalTargets) + + Quality Categories: + - Excellent (< 5% error): 100 points each + - Good (5-20% error): 75 points each + - Poor (≥ 20% error): 0 points each + """ + from policyengine_us_data.datasets.cps import MinimizedEnhancedCPS_2024 + from policyengine_us_data.utils.minimise import create_calibration_log_file + from policyengine_us import Microsimulation + + sim = Microsimulation(dataset=MinimizedEnhancedCPS_2024) + assert ( + len(sim.calculate("household_weight")) < 30_000 + ), "Minimized Enhanced CPS should have fewer than 30,000 households." + + create_calibration_log_file(MinimizedEnhancedCPS_2024) + + calibration_log = pd.read_csv( + str(MinimizedEnhancedCPS_2024.file_path).replace( + ".h5", "_calibration_log.csv" + ) + ) + + # Calculate quality categories + excellent_count = ( + calibration_log["rel_abs_error"] < 0.05 + ).sum() # < 5% error + good_count = ( + (calibration_log["rel_abs_error"] >= 0.05) + & (calibration_log["rel_abs_error"] < 0.20) + ).sum() # 5-20% error + poor_count = ( + calibration_log["rel_abs_error"] >= 0.20 + ).sum() # ≥ 20% error + total_targets = len(calibration_log) + + # Calculate quality score + quality_score = (excellent_count * 100 + good_count * 75) / total_targets + + print(f" Total targets: {total_targets}") + print(f" Excellent (< 5% error): {excellent_count}") + print(f" Good (5-20% error): {good_count}") + print(f" Poor (≥ 20% error): {poor_count}") + print(f" Quality score: {quality_score:.1f}%") + + # Assert quality score is above 75% + assert quality_score >= 75.0, ( + f"Calibration quality score {quality_score:.1f}% is below 75% threshold " + f"for {MinimizedEnhancedCPS_2024.label}. " + f"Breakdown: {excellent_count} excellent, {good_count} good, {poor_count} poor " + f"out of {total_targets} total targets." 
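        # Illustrative arithmetic for the score above: with 80 excellent, 15 good and
        # 5 poor targets out of 100, quality_score = (80 * 100 + 15 * 75) / 100 = 91.25,
        # which clears the 75.0 threshold.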
+ ) diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index b3e0ed1a..2048ce61 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -21,24 +21,10 @@ def create_calibration_log_file(file_path, epoch=0): - print(f"=== CALIBRATION LOG DEBUG ===") - print(f"File path: {file_path}") - print(f"Epoch: {epoch}") - dataset = Dataset.from_file(file_path) sim = Microsimulation(dataset=dataset) - # Debug: Print dataset info - household_weights = sim.calculate("household_weight", 2024) - print(f"Number of households: {len(household_weights)}") - print(f"Total weight: {household_weights.sum():.2f}") - print( - f"Weight range: {household_weights.min():.2f} to {household_weights.max():.2f}" - ) - loss_matrix, targets = build_loss_matrix(dataset, 2024) - print(f"Loss matrix shape: {loss_matrix.shape}") - print(f"Number of targets: {len(targets)}") bad_mask = loss_matrix.columns.isin(bad_targets) keep_mask_bool = ~bad_mask @@ -46,10 +32,6 @@ def create_calibration_log_file(file_path, epoch=0): loss_matrix_clean = loss_matrix.iloc[:, keep_idx] targets_clean = targets[keep_idx] - print(f"After filtering bad targets:") - print(f"Loss matrix clean shape: {loss_matrix_clean.shape}") - print(f"Number of clean targets: {len(targets_clean)}") - assert loss_matrix_clean.shape[1] == targets_clean.size estimates = ( @@ -57,18 +39,9 @@ def create_calibration_log_file(file_path, epoch=0): ) target_names = loss_matrix_clean.columns - # Debug: Print estimate statistics - print(f"Estimates shape: {estimates.shape}") - print(f"Estimates sum: {estimates.sum():.2f}") - print(f"First 3 estimates: {estimates[:3]}") - print(f"First 3 targets: {targets_clean[:3]}") - # Calculate and print some key metrics errors = estimates - targets_clean rel_errors = errors / targets_clean - print(f"Mean absolute error: {np.abs(errors).mean():.2f}") - print(f"Mean relative error: {np.abs(rel_errors).mean():.4f}") - print(f"=== END DEBUG ===\n") df = pd.DataFrame( { @@ -144,28 +117,6 @@ def losses_for_candidates( return losses -def minimise_dataset( - dataset, output_path: str, loss_rel_change_max: float -) -> None: - dataset = str(dataset) - create_calibration_log_file(dataset) - - dataset = Dataset.from_file(dataset) - loss_matrix = build_loss_matrix(dataset, 2024) - - sim = Microsimulation(dataset=dataset) - - weights = sim.calculate("household_weight", 2024).values - estimate_matrix, targets = loss_matrix - is_national = estimate_matrix.columns.str.startswith("nation/") - nation_normalisation_factor = is_national * (1 / is_national.sum()) - state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) - normalisation_factor = np.where( - is_national, nation_normalisation_factor, state_normalisation_factor - ) - weights @ estimate_matrix - - def get_loss_from_mask( weights, inclusion_mask, estimate_matrix, targets, normalisation_factor ): @@ -185,7 +136,9 @@ def get_loss_from_mask( # Step 2: Re-calibrate the masked weights to hit targets # Only calibrate the included households included_weights = masked_weights[inclusion_mask] - included_estimate_matrix = estimate_matrix[inclusion_mask] + included_estimate_matrix = estimate_matrix.iloc[ + inclusion_mask + ] # Keep as DataFrame # Call reweight function to calibrate the selected households calibrated_weights_included = reweight( @@ -354,10 +307,12 @@ def random_sampling_minimization( return final_mask -def minimise_dataset( +def minimize_dataset( dataset, output_path: str, minimization_function: 
Callable = candidate_loss_contribution, + loss_matrix: Optional[pd.DataFrame] = None, + targets: Optional[np.ndarray] = None, **kwargs, ) -> None: """ @@ -375,14 +330,15 @@ def minimise_dataset( create_calibration_log_file(dataset) dataset = Dataset.from_file(dataset) - loss_matrix, targets = build_loss_matrix(dataset, 2024) + if loss_matrix is None or targets is None: + loss_matrix, targets = build_loss_matrix(dataset, 2024) - bad_mask = loss_matrix.columns.isin(bad_targets) - keep_mask_bool = ~bad_mask - keep_idx = np.where(keep_mask_bool)[0] - loss_matrix_clean = loss_matrix.iloc[:, keep_idx] - targets_clean = targets[keep_idx] - assert loss_matrix_clean.shape[1] == targets_clean.size + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_clean = targets[keep_idx] + assert loss_matrix_clean.shape[1] == targets_clean.size sim = Microsimulation(dataset=dataset) @@ -427,10 +383,21 @@ def minimise_dataset( # Re-calibrate the final selected households to hit targets print("Re-calibrating final selected households...") + + # Build loss matrix for the smaller dataset + smaller_loss_matrix, smaller_targets = build_loss_matrix(sim.dataset, 2024) + + # Apply same filtering as before + bad_mask = smaller_loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + smaller_loss_matrix_clean = smaller_loss_matrix.iloc[:, keep_idx] + smaller_targets_clean = smaller_targets[keep_idx] + calibrated_weights = reweight( initial_weights, - loss_matrix_clean.values, # Convert to numpy array - targets_clean, + smaller_loss_matrix_clean, # Now matches the smaller dataset size + smaller_targets_clean, epochs=250, # Reduced epochs for faster processing ) sim.set_input("household_weight", 2024, calibrated_weights) @@ -460,7 +427,7 @@ def minimise_dataset( for file in files: output_path = file.with_name(file.stem + "_minimised.h5") - minimise_dataset( + minimize_dataset( file, output_path, ) diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb deleted file mode 100644 index 7c416e2a..00000000 --- a/test_minimization_approach.ipynb +++ /dev/null @@ -1,257 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "id": "d6dc9cca", - "metadata": {}, - "outputs": [], - "source": [ - "from policyengine_us_data.utils.minimise import minimise_dataset, random_sampling_minimization, candidate_loss_contribution\n", - "from policyengine_us_data.storage import STORAGE_FOLDER\n", - "from policyengine_us import Microsimulation\n", - "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", - "from policyengine_us_data.utils import build_loss_matrix\n", - "import numpy as np\n", - "import os\n", - "import h5py\n", - "\n", - "bad_targets = [\n", - " \"nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household\",\n", - " \"nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household\",\n", - " \"nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse\",\n", - " \"nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse\",\n", - " \"nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household\",\n", - " \"nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household\",\n", - " \"nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing 
Jointly/Surviving Spouse\",\n", - " \"nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse\",\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "683fd57e", - "metadata": {}, - "outputs": [], - "source": [ - "# Length of household entity in the dataset measured through household_weight:\n", - "\n", - "# Original ECPS 2024 dataset size: 41310\n", - "# Through \"random_sampling_minimization\" with 0.5 of the dataset being pruned: 20655\n", - "# Through \"random_sampling_minimization\" with 0.2 of the dataset being pruned: 33408\n", - "# After minimization through \"candidate_loss_contribution\" and a 1.0 max error change: 20655 \n", - "# After minimization through \"candidate_loss_contribution\" and a 0.001 max error change: 24786" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "db975ac1", - "metadata": {}, - "outputs": [], - "source": [ - "## ALL TESTS\n", - "\n", - "## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation\n", - "\n", - "input_dataset = ExtendedCPS_2024\n", - "\n", - "approaches = [\"l0_sigmoid\", \"l0_log\", \"l0_exp\", \"l1\"]\n", - "\n", - "for approach in approaches:\n", - " sim = Microsimulation(dataset=input_dataset)\n", - " data = sim.dataset.load_dataset()\n", - " data[\"household_weight\"] = {}\n", - " original_weights = sim.calculate(\"household_weight\")\n", - " original_weights = original_weights.values + np.random.normal(\n", - " 1, 0.1, len(original_weights)\n", - " )\n", - " for year in range(2024, 2025):\n", - " loss_matrix, targets_array = build_loss_matrix(\n", - " input_dataset, year\n", - " )\n", - "\n", - " bad_mask = loss_matrix.columns.isin(bad_targets)\n", - " keep_mask_bool = ~bad_mask\n", - " keep_idx = np.where(keep_mask_bool)[0]\n", - " loss_matrix_clean = loss_matrix.iloc[:, keep_idx]\n", - " targets_array_clean = targets_array[keep_idx]\n", - " assert loss_matrix_clean.shape[1] == targets_array_clean.size\n", - "\n", - " optimised_weights = reweight(\n", - " original_weights,\n", - " loss_matrix_clean,\n", - " targets_array_clean,\n", - " log_path=\"calibration_log.csv\",\n", - " penalty_approach=approach,\n", - " epochs=250, # Reduced epochs for faster processing\n", - " )\n", - " data[\"household_weight\"][year] = optimised_weights\n", - "\n", - " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", - " output_path.parent.mkdir(parents=True, exist_ok=True)\n", - "\n", - " # Save to HDF5 file\n", - " with h5py.File(output_path, \"w\") as f:\n", - " for variable, values in data.items():\n", - " for year, value in values.items():\n", - " f.create_dataset(f\"{variable}/{year}\", data=value)\n", - "\n", - "\n", - "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", - "\n", - "files = [\n", - " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", - " ]\n", - "\n", - "approaches = {\n", - " \"random_sampling_minimization\": random_sampling_minimization,\n", - " \"candidate_loss_contribution\": candidate_loss_contribution,\n", - "}\n", - "\n", - "optional_params = {\n", - " \"random_sampling_minimization\": {\n", - " \"target_fractions\": [0.5, 0.6, 0.7, 0.8, 0.9], # fractions of the dataset to keep\n", - " },\n", - " \"candidate_loss_contribution\": {\n", - " \"loss_rel_change_max\": [0.00001, 0.000001, 0.0000001] # maximum relative change in\n", - " }\n", - "}\n", - "\n", - "for approach, function in approaches.items():\n", - " minimization_function = 
function\n", - " # other minimization function approach is \"random_sampling_minimization\", for which you can specify the tolerance for loss relative change.\n", - "\n", - " for params, values in optional_params[approach].items():\n", - " for value in values:\n", - " if params == \"target_fractions\":\n", - " for file in files:\n", - " output_path = STORAGE_FOLDER / approach / f\"{value}_enhanced_cps_2024_minimised.h5\"\n", - " output_path.parent.mkdir(parents=True, exist_ok=True)\n", - " minimise_dataset(\n", - " file,\n", - " output_path,\n", - " minimization_function=minimization_function, \n", - " target_fractions=[value]\n", - " )\n", - " elif params == \"loss_rel_change_max\":\n", - " for file in files:\n", - " output_path = STORAGE_FOLDER / approach / f\"{value}_enhanced_cps_2024_minimised.h5\"\n", - " output_path.parent.mkdir(parents=True, exist_ok=True)\n", - " minimise_dataset(\n", - " file,\n", - " output_path,\n", - " minimization_function=minimization_function, \n", - " loss_rel_change_max=value\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "35892c9d", - "metadata": {}, - "outputs": [], - "source": [ - "## SMALL CHECKS BELOW -- IGNORE ---\n", - "\n", - "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", - "\n", - "files = [\n", - " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", - " ]\n", - "\n", - "minimization_function = random_sampling_minimization\n", - "# other minimization function approach is \"random_sampling_minimization\", for which you can specify the tolerance for loss relative change.\n", - "\n", - "for file in files:\n", - " output_path = STORAGE_FOLDER / \"random_sampling_minimization\" / f\"{1.0}_enhanced_cps_2024_minimised.h5\"\n", - " output_path.parent.mkdir(parents=True, exist_ok=True)\n", - " minimise_dataset(\n", - " file,\n", - " output_path,\n", - " minimization_function=minimization_function, \n", - " target_fractions=[1.0]\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b4cf8e89", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250/250 [01:24<00:00, 2.98it/s, loss=3.37e-5, loss_rel_change=-0.92] \n" - ] - } - ], - "source": [ - "input_dataset = ExtendedCPS_2024\n", - "\n", - "sim = Microsimulation(dataset=input_dataset)\n", - "data = sim.dataset.load_dataset()\n", - "data[\"household_weight\"] = {}\n", - "original_weights = sim.calculate(\"household_weight\")\n", - "original_weights = original_weights.values + np.random.normal(\n", - " 1, 0.1, len(original_weights)\n", - ")\n", - "for year in range(2024, 2025):\n", - " loss_matrix, targets_array = build_loss_matrix(\n", - " input_dataset, year\n", - " )\n", - "\n", - " bad_mask = loss_matrix.columns.isin(bad_targets)\n", - " keep_mask_bool = ~bad_mask\n", - " keep_idx = np.where(keep_mask_bool)[0]\n", - " loss_matrix_clean = loss_matrix.iloc[:, keep_idx]\n", - " targets_array_clean = targets_array[keep_idx]\n", - " assert loss_matrix_clean.shape[1] == targets_array_clean.size\n", - " assert loss_matrix_clean.shape[1] != loss_matrix.shape[1]\n", - "\n", - " optimised_weights = reweight(\n", - " original_weights,\n", - " loss_matrix_clean,\n", - " targets_array_clean,\n", - " log_path=\"baseline_calibration_log.csv\",\n", - " epochs=250, # Reduced epochs for faster processing\n", - " )\n", - " data[\"household_weight\"][year] = optimised_weights\n", - "\n", - "output_path = STORAGE_FOLDER / \"baseline\" / 
\"enhanced_cps_2024_baseline.h5\"\n", - "output_path.parent.mkdir(parents=True, exist_ok=True)\n", - "\n", - "# Save to HDF5 file\n", - "with h5py.File(output_path, \"w\") as f:\n", - " for variable, values in data.items():\n", - " for year, value in values.items():\n", - " f.create_dataset(f\"{variable}/{year}\", data=value)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "pe", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.11" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 9f0266578d91bdc8a682018b9c0d7b4e73f84e4b Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 11:11:49 +0200 Subject: [PATCH 42/56] load artifact in workflows --- .github/workflows/code_changes.yaml | 5 +++++ .github/workflows/pr_code_changes.yaml | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/.github/workflows/code_changes.yaml b/.github/workflows/code_changes.yaml index b752e953..908dd887 100644 --- a/.github/workflows/code_changes.yaml +++ b/.github/workflows/code_changes.yaml @@ -56,6 +56,11 @@ jobs: with: name: calibration_log.csv path: calibration_log.csv + - name: Save minimized ECPS calibration log + uses: actions/upload-artifact@v4 + with: + name: minimized_enhanced_cps_2024_calibration_log.csv + path: minimized_enhanced_cps_2024_calibration_log.csv - name: Run tests run: pytest - name: Upload data diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 4e30d089..524d712c 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -82,6 +82,11 @@ jobs: with: name: calibration_log.csv path: calibration_log.csv + - name: Save minimized ECPS calibration log + uses: actions/upload-artifact@v4 + with: + name: minimized_enhanced_cps_2024_calibration_log.csv + path: minimized_enhanced_cps_2024_calibration_log.csv - name: Run tests run: pytest From fdd2e5285f8200135f652f3b8373972482437cad Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 11:20:08 +0200 Subject: [PATCH 43/56] fix importing errors --- policyengine_us_data/datasets/cps/enhanced_cps.py | 9 +++++---- policyengine_us_data/utils/minimise.py | 5 ++++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 82aa9f27..a27264d8 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -9,10 +9,6 @@ import numpy as np from typing import Type from policyengine_us_data.storage import STORAGE_FOLDER -from policyengine_us_data.utils.minimise import ( - candidate_loss_contribution, - minimize_dataset, -) from policyengine_us_data.datasets.cps.extended_cps import ( ExtendedCPS_2024, CPS_2019, @@ -343,6 +339,11 @@ def generate(self): targets_array_clean = targets_array[keep_idx] assert loss_matrix_clean.shape[1] == targets_array_clean.size + from policyengine_us_data.utils.minimise import ( + candidate_loss_contribution, + minimize_dataset, + ) + minimize_dataset( self.input_dataset, self.file_path, diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 2048ce61..17461a07 100644 --- a/policyengine_us_data/utils/minimise.py +++ 
b/policyengine_us_data/utils/minimise.py @@ -6,7 +6,6 @@ import h5py from policyengine_us_data.storage import STORAGE_FOLDER from typing import Optional, Callable -from policyengine_us_data.datasets.cps.enhanced_cps import reweight bad_targets = [ "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", @@ -141,6 +140,8 @@ def get_loss_from_mask( ] # Keep as DataFrame # Call reweight function to calibrate the selected households + from policyengine_us_data.datasets.cps.enhanced_cps import reweight + calibrated_weights_included = reweight( included_weights, included_estimate_matrix, @@ -394,6 +395,8 @@ def minimize_dataset( smaller_loss_matrix_clean = smaller_loss_matrix.iloc[:, keep_idx] smaller_targets_clean = smaller_targets[keep_idx] + from policyengine_us_data.datasets.cps.enhanced_cps import reweight + calibrated_weights = reweight( initial_weights, smaller_loss_matrix_clean, # Now matches the smaller dataset size From a87a0b9d12c6222e1806930b89cab236534e3763 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 11:38:33 +0200 Subject: [PATCH 44/56] fix dataset initialization error --- policyengine_us_data/datasets/cps/enhanced_cps.py | 3 ++- policyengine_us_data/utils/{minimise.py => minimize.py} | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) rename policyengine_us_data/utils/{minimise.py => minimize.py} (99%) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index a27264d8..195cc173 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -305,9 +305,10 @@ def generate(self): self.save_dataset(data) -class MinimizedEnhancedCPS_2024(Dataset): +class MinimizedEnhancedCPS_2024(EnhancedCPS): input_dataset = ExtendedCPS_2024 start_year = 2024 + end_year = 2024 name = "minimized_enhanced_cps_2024" label = "Minimized Enhanced CPS 2024" file_path = STORAGE_FOLDER / "minimized_enhanced_cps_2024.h5" diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimize.py similarity index 99% rename from policyengine_us_data/utils/minimise.py rename to policyengine_us_data/utils/minimize.py index 17461a07..6e61daff 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimize.py @@ -419,7 +419,7 @@ def minimize_dataset( f.create_dataset(f"{variable}/{year}", data=value) print(f"Saved minimised dataset to {output_path}") - create_calibration_log_file(output_path, epoch=500) + create_calibration_log_file(output_path, epoch=250) if __name__ == "__main__": From 6f78752770aa7c62b8bb906dffeb398c6133331f Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 11:39:37 +0200 Subject: [PATCH 45/56] and imports --- policyengine_us_data/tests/test_datasets/test_enhanced_cps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py index c6660f66..7c815880 100644 --- a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py +++ b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py @@ -268,7 +268,7 @@ def test_minimized_enhanced_cps_calibration_quality(): - Poor (≥ 20% error): 0 points each """ from policyengine_us_data.datasets.cps import MinimizedEnhancedCPS_2024 - from policyengine_us_data.utils.minimise import create_calibration_log_file + from policyengine_us_data.utils.minimize import create_calibration_log_file from 
policyengine_us import Microsimulation sim = Microsimulation(dataset=MinimizedEnhancedCPS_2024) From 9d0c9e19d651ecf9c72f91e1924141c9522472d2 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 11:47:06 +0200 Subject: [PATCH 46/56] attempting to fix data download validation error --- .github/workflows/pr_code_changes.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 524d712c..678d7d0d 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -71,6 +71,7 @@ jobs: run: make download env: HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Build datasets run: make data From 340dc6b4b243f0fca13e368f3b7f31e27e3fcb71 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 12:07:51 +0200 Subject: [PATCH 47/56] minor bug --- policyengine_us_data/datasets/cps/enhanced_cps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 195cc173..ac6f01dc 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -340,7 +340,7 @@ def generate(self): targets_array_clean = targets_array[keep_idx] assert loss_matrix_clean.shape[1] == targets_array_clean.size - from policyengine_us_data.utils.minimise import ( + from policyengine_us_data.utils.minimize import ( candidate_loss_contribution, minimize_dataset, ) From c03eb49ecc993729d547685e745d1b852de32327 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 12:33:24 +0200 Subject: [PATCH 48/56] fix dataset path --- policyengine_us_data/utils/minimize.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/policyengine_us_data/utils/minimize.py b/policyengine_us_data/utils/minimize.py index 6e61daff..199d6fb4 100644 --- a/policyengine_us_data/utils/minimize.py +++ b/policyengine_us_data/utils/minimize.py @@ -327,10 +327,15 @@ def minimize_dataset( minimization_function : function that implements the minimization logic **kwargs : additional arguments to pass to the minimization function """ - dataset = str(dataset) - create_calibration_log_file(dataset) + # Handle both dataset class and file path + if hasattr(dataset, "file_path"): + dataset_path = str(dataset.file_path) + else: + dataset_path = str(dataset) - dataset = Dataset.from_file(dataset) + create_calibration_log_file(dataset_path) + + dataset = Dataset.from_file(dataset_path) if loss_matrix is None or targets is None: loss_matrix, targets = build_loss_matrix(dataset, 2024) From 6007db2fbdbe294e8a831fc8f622bd77278c61c7 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 13:03:12 +0200 Subject: [PATCH 49/56] fix minimize.py variables --- policyengine_us_data/utils/minimize.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/policyengine_us_data/utils/minimize.py b/policyengine_us_data/utils/minimize.py index 199d6fb4..0c4d06b6 100644 --- a/policyengine_us_data/utils/minimize.py +++ b/policyengine_us_data/utils/minimize.py @@ -345,6 +345,9 @@ def minimize_dataset( loss_matrix_clean = loss_matrix.iloc[:, keep_idx] targets_clean = targets[keep_idx] assert loss_matrix_clean.shape[1] == targets_clean.size + else: + loss_matrix_clean = loss_matrix + targets_clean = targets sim = Microsimulation(dataset=dataset) From 171d0726d4472687a88c7a3ec50e6e02e3310452 Mon Sep 17 00:00:00 
2001 From: juaristi22 Date: Tue, 15 Jul 2025 14:10:43 +0200 Subject: [PATCH 50/56] change params --- policyengine_us_data/datasets/cps/enhanced_cps.py | 8 ++++---- policyengine_us_data/utils/minimize.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index ac6f01dc..39c93f49 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -351,9 +351,9 @@ def generate(self): minimization_function=candidate_loss_contribution, loss_matrix=loss_matrix_clean, targets=targets_array_clean, - target_fractions=[0.1], # maximum relative change in loss - count_iterations=5, - view_fraction_per_iteration=0.5, + loss_rel_change_max=[0.1], # maximum relative change in loss + count_iterations=6, + view_fraction_per_iteration=0.4, fraction_remove_per_iteration=0.1, ) @@ -369,5 +369,5 @@ class EnhancedCPS_2024(EnhancedCPS): if __name__ == "__main__": - EnhancedCPS_2024().generate() + # EnhancedCPS_2024().generate() MinimizedEnhancedCPS_2024().generate() diff --git a/policyengine_us_data/utils/minimize.py b/policyengine_us_data/utils/minimize.py index 0c4d06b6..8575470a 100644 --- a/policyengine_us_data/utils/minimize.py +++ b/policyengine_us_data/utils/minimize.py @@ -167,9 +167,9 @@ def candidate_loss_contribution( targets: np.ndarray, normalisation_factor: np.ndarray, loss_rel_change_max: float, - count_iterations: int = 5, - view_fraction_per_iteration: float = 0.3, - fraction_remove_per_iteration: float = 0.1, + count_iterations: int = 10, + view_fraction_per_iteration: float = 0.5, + fraction_remove_per_iteration: float = 0.05, ) -> np.ndarray: """ Minimization approach based on candidate loss contribution. 
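(For orientation on the knobs retuned in this patch: candidate_loss_contribution prunes households over several passes, scoring a fraction of candidates per pass (view_fraction_per_iteration) and dropping a fraction of households per pass (fraction_remove_per_iteration). The snippet below is only back-of-the-envelope arithmetic under the values passed in MinimizedEnhancedCPS_2024.generate() above; the real routine also reweights the survivors and respects loss_rel_change_max, so actual counts will differ.)

# Illustrative arithmetic only -- not the library implementation.
count_iterations = 6
view_fraction_per_iteration = 0.4
fraction_remove_per_iteration = 0.1

n_households = 100_000  # assumed starting size, for illustration
for i in range(count_iterations):
    n_scored = int(n_households * view_fraction_per_iteration)    # candidates examined this pass
    n_removed = int(n_households * fraction_remove_per_iteration) # least useful candidates dropped
    n_households -= n_removed
    print(f"iteration {i + 1}: scored {n_scored} candidates, {n_households} households remain")
# At 10% removal per pass, roughly 0.9 ** 6 (about 53%) of households survive six passes.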
From 1e235814f1d3301adea65d70b9a7a5f1e247cbb5 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 14:11:31 +0200 Subject: [PATCH 51/56] round 2 --- policyengine_us_data/datasets/cps/enhanced_cps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 39c93f49..915b0d04 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -369,5 +369,5 @@ class EnhancedCPS_2024(EnhancedCPS): if __name__ == "__main__": - # EnhancedCPS_2024().generate() + EnhancedCPS_2024().generate() MinimizedEnhancedCPS_2024().generate() From 8119f7ca80865cb3a0c9748555900161f1dca915 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 16:30:53 +0200 Subject: [PATCH 52/56] check if sparse l0 approach to minimizing works --- .../datasets/cps/enhanced_cps.py | 364 ++++++++++++------ .../datasets/cps/small_enhanced_cps.py | 108 ++++++ .../test_datasets/test_sparse_enhanced_cps.py | 85 ++++ policyengine_us_data/utils/__init__.py | 1 + policyengine_us_data/utils/l0.py | 208 ++++++++++ 5 files changed, 653 insertions(+), 113 deletions(-) create mode 100644 policyengine_us_data/tests/test_datasets/test_sparse_enhanced_cps.py create mode 100644 policyengine_us_data/utils/l0.py diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 915b0d04..38e9fad0 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -1,10 +1,8 @@ from policyengine_core.data import Dataset import pandas as pd from policyengine_us_data.utils import ( - pe_to_soi, - get_soi, build_loss_matrix, - fmt, + HardConcrete, ) import numpy as np from typing import Type @@ -15,6 +13,10 @@ CPS_2024, ) import os +import logging + +logger = logging.getLogger(__name__) +logger.setLevel(logging.WARNING) try: import torch @@ -41,133 +43,232 @@ def reweight( dropout_rate=0.05, epochs=500, log_path="calibration_log.csv", - penalty_approach=None, + l0_lambda=1e-5, + init_mean=0.999, + temperature=0.5, + sparse=False, ): + if loss_matrix.shape[1] == 0: + raise ValueError("loss_matrix has no columns after filtering") + + # Store column names before converting to tensor target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") - loss_matrix = torch.tensor(loss_matrix.values, dtype=torch.float32) + + # Keep numpy versions for final diagnostics + loss_matrix_numpy = loss_matrix.values + targets_array_numpy = np.array(targets_array) + + # Convert to tensors for training + loss_matrix_tensor = torch.tensor(loss_matrix_numpy, dtype=torch.float32) + targets_array_tensor = torch.tensor( + targets_array_numpy, dtype=torch.float32 + ) + + # Compute normalization factors nation_normalisation_factor = is_national * (1 / is_national.sum()) state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) normalisation_factor = np.where( is_national, nation_normalisation_factor, state_normalisation_factor ) - normalisation_factor = torch.tensor( + normalisation_factor_tensor = torch.tensor( normalisation_factor, dtype=torch.float32 ) - targets_array = torch.tensor(targets_array, dtype=torch.float32) + inv_mean_normalisation = 1 / np.mean(normalisation_factor) + + # Initialize weights weights = torch.tensor( np.log(original_weights), requires_grad=True, dtype=torch.float32 ) - # TO DO: replace this with a 
call to the python reweight.py package. - def loss(weights, penalty_approach=penalty_approach): - # Check for Nans in either the weights or the loss matrix + def loss(weights): if torch.isnan(weights).any(): raise ValueError("Weights contain NaNs") - if torch.isnan(loss_matrix).any(): + if torch.isnan(loss_matrix_tensor).any(): raise ValueError("Loss matrix contains NaNs") - estimate = weights @ loss_matrix + + estimate = weights @ loss_matrix_tensor + if torch.isnan(estimate).any(): raise ValueError("Estimate contains NaNs") + rel_error = ( - ((estimate - targets_array) + 1) / (targets_array + 1) + ((estimate - targets_array_tensor) + 1) + / (targets_array_tensor + 1) ) ** 2 - rel_error_normalized = rel_error * normalisation_factor + rel_error_normalized = ( + inv_mean_normalisation * rel_error * normalisation_factor_tensor + ) if torch.isnan(rel_error_normalized).any(): raise ValueError("Relative error contains NaNs") - if penalty_approach is not None: - # L0 penalty (approximated with smooth function) - # Since L0 is non-differentiable, we use a smooth approximation - # Common approaches: - - epsilon = 1e-3 # Threshold for "near zero" - l0_penalty_weight = 1e-1 # Adjust this hyperparameter - - # Option 1: Sigmoid approximation - if penalty_approach == "l0_sigmoid": - smoothed_l0 = torch.sigmoid( - (weights - epsilon) / (epsilon * 0.1) - ).mean() - - # Option 2: Log-sum penalty (smoother) - if penalty_approach == "l0_log": - smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len( - weights - ) - - # Option 3: Exponential penalty - if penalty_approach == "l0_exp": - smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() - - # L1 penalty - l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs - - if penalty_approach == "l1": - l1 = torch.mean(weights) - return rel_error_normalized.mean() + l1_penalty_weight * l1 - - return ( - rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 - ) - - else: - return rel_error_normalized.mean() + return rel_error_normalized.mean() def dropout_weights(weights, p): if p == 0: return weights - # Replace p% of the weights with the mean value of the rest of them mask = torch.rand_like(weights) < p mean = weights[~mask].mean() masked_weights = weights.clone() masked_weights[mask] = mean return masked_weights - optimizer = torch.optim.Adam([weights], lr=3e-1) - from tqdm import trange - - start_loss = None - - iterator = trange(epochs) - performance = pd.DataFrame() - for i in iterator: - optimizer.zero_grad() - weights_ = dropout_weights(weights, dropout_rate) - l = loss(torch.exp(weights_)) - if (log_path is not None) and (i % 10 == 0): - estimates = torch.exp(weights) @ loss_matrix - estimates = estimates.detach().numpy() - df = pd.DataFrame( - { - "target_name": target_names, - "estimate": estimates, - "target": targets_array.detach().numpy(), - } + def compute_diagnostics(final_weights, label=""): + """Helper function to compute and log diagnostics""" + estimate = final_weights @ loss_matrix_numpy + rel_error = ( + ((estimate - targets_array_numpy) + 1) / (targets_array_numpy + 1) + ) ** 2 + within_10_percent_mask = np.abs(estimate - targets_array_numpy) <= ( + 0.10 * np.abs(targets_array_numpy) + ) + percent_within_10 = np.mean(within_10_percent_mask) * 100 + + logger.info( + f"\n\n---{label} Solutions: reweighting quick diagnostics----\n" + ) + logger.info( + f"{np.sum(final_weights == 0)} are zero, {np.sum(final_weights != 0)} weights are nonzero" + ) + logger.info( + f"rel_error: min: {np.min(rel_error):.2f}\n" + f"max: 
{np.max(rel_error):.2f}\n" + f"mean: {np.mean(rel_error):.2f}\n" + f"median: {np.median(rel_error):.2f}\n" + f"Within 10% of target: {percent_within_10:.2f}%" + ) + logger.info("Relative error over 100% for:") + for i in np.where(rel_error > 1)[0]: + logger.info(f"target_name: {target_names[i]}") + logger.info(f"target_value: {targets_array_numpy[i]}") + logger.info(f"estimate_value: {estimate[i]}") + logger.info(f"has rel_error: {rel_error[i]:.2f}\n") + logger.info("---End of reweighting quick diagnostics------") + + if not sparse: + # Dense training + optimizer = torch.optim.Adam([weights], lr=3e-1) + from tqdm import trange + + start_loss = None + iterator = trange(epochs) + performance = pd.DataFrame() + + for i in iterator: + optimizer.zero_grad() + weights_ = dropout_weights(weights, dropout_rate) + l = loss(torch.exp(weights_)) + + if (log_path is not None) and (i % 10 == 0): + with torch.no_grad(): + estimates = ( + torch.exp(weights) @ loss_matrix_tensor + ).numpy() + df = pd.DataFrame( + { + "target_name": target_names, + "estimate": estimates, + "target": targets_array_numpy, + } + ) + df["epoch"] = i + df["error"] = df.estimate - df.target + df["rel_error"] = df.error / df.target + df["abs_error"] = df.error.abs() + df["rel_abs_error"] = df.rel_error.abs() + df["loss"] = df.rel_abs_error**2 + performance = pd.concat([performance, df], ignore_index=True) + + if (log_path is not None) and (i % 1000 == 0): + performance.to_csv(log_path, index=False) + + if start_loss is None: + start_loss = l.item() + loss_rel_change = (l.item() - start_loss) / start_loss + + l.backward() + iterator.set_postfix( + {"loss": l.item(), "loss_rel_change": loss_rel_change} ) - df["epoch"] = i - df["error"] = df.estimate - df.target - df["rel_error"] = df.error / df.target - df["abs_error"] = df.error.abs() - df["rel_abs_error"] = df.rel_error.abs() - df["loss"] = df.rel_abs_error**2 - performance = pd.concat([performance, df], ignore_index=True) - - if (log_path is not None) and (i % 1000 == 0): + optimizer.step() + + if log_path is not None: performance.to_csv(log_path, index=False) - if start_loss is None: - start_loss = l.item() - loss_rel_change = (l.item() - start_loss) / start_loss - l.backward() - iterator.set_postfix( - {"loss": l.item(), "loss_rel_change": loss_rel_change} + + final_weights_dense = torch.exp(weights).detach().numpy() + compute_diagnostics(final_weights_dense, "Dense") + return final_weights_dense + + else: + # Sparse training + weights = torch.tensor( + np.log(original_weights), requires_grad=True, dtype=torch.float32 ) - optimizer.step() + gates = HardConcrete( + len(original_weights), init_mean=init_mean, temperature=temperature + ) + + optimizer = torch.optim.Adam( + [weights] + list(gates.parameters()), lr=3e-1 + ) + from tqdm import trange + + start_loss = None + iterator = trange(epochs) + performance = pd.DataFrame() + + for i in iterator: + optimizer.zero_grad() + weights_ = dropout_weights(weights, dropout_rate) + masked = torch.exp(weights_) * gates() + l_main = loss(masked) + l = l_main + l0_lambda * gates.get_penalty() + + if (log_path is not None) and (i % 10 == 0): + gates.eval() + with torch.no_grad(): + estimates = ( + (torch.exp(weights) * gates()) @ loss_matrix_tensor + ).numpy() + gates.train() + + df = pd.DataFrame( + { + "target_name": target_names, + "estimate": estimates, + "target": targets_array_numpy, + } + ) + df["epoch"] = i + df["error"] = df.estimate - df.target + df["rel_error"] = df.error / df.target + df["abs_error"] = df.error.abs() + 
df["rel_abs_error"] = df.rel_error.abs() + df["loss"] = df.rel_abs_error**2 + performance = pd.concat([performance, df], ignore_index=True) + + if (log_path is not None) and (i % 1000 == 0): + performance.to_csv(log_path, index=False) + + if start_loss is None: + start_loss = l.item() + loss_rel_change = (l.item() - start_loss) / start_loss + + l.backward() + iterator.set_postfix( + {"loss": l.item(), "loss_rel_change": loss_rel_change} + ) + optimizer.step() + if log_path is not None: performance.to_csv(log_path, index=False) - return torch.exp(weights).detach().numpy() + gates.eval() + final_weights_sparse = (torch.exp(weights) * gates()).detach().numpy() + compute_diagnostics(final_weights_sparse, "Sparse") + + return final_weights_sparse def train_previous_year_income_model(): @@ -253,26 +354,6 @@ def generate(self): ) data["household_weight"][year] = optimised_weights - print("\n\n---reweighting quick diagnostics----\n") - estimate = optimised_weights @ loss_matrix_clean - rel_error = ( - ((estimate - targets_array_clean) + 1) - / (targets_array_clean + 1) - ) ** 2 - print( - f"rel_error: min: {np.min(rel_error):.2f}, " - f"max: {np.max(rel_error):.2f} " - f"mean: {np.mean(rel_error):.2f}, " - f"median: {np.median(rel_error):.2f}" - ) - print("Relative error over 100% for:") - for i in np.where(rel_error > 1)[0]: - print(f"target_name: {loss_matrix_clean.columns[i]}") - print(f"target_value: {targets_array_clean[i]}") - print(f"estimate_value: {estimate[i]}") - print(f"has rel_error: {rel_error[i]:.2f}\n") - print("---End of reweighting quick diagnostics------") - self.save_dataset(data) @@ -336,8 +417,18 @@ def generate(self): bad_mask = loss_matrix.columns.isin(bad_targets) keep_mask_bool = ~bad_mask keep_idx = np.where(keep_mask_bool)[0] - loss_matrix_clean = loss_matrix.iloc[:, keep_idx] - targets_array_clean = targets_array[keep_idx] + + # Check if filtering would remove all columns + if len(keep_idx) == 0: + print( + "WARNING: bad_targets filtering would remove all columns, using all columns instead" + ) + keep_idx = np.arange(loss_matrix.shape[1]) + targets_array_clean = targets_array + loss_matrix_clean = loss_matrix + else: + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_array_clean = targets_array[keep_idx] assert loss_matrix_clean.shape[1] == targets_array_clean.size from policyengine_us_data.utils.minimize import ( @@ -358,6 +449,52 @@ def generate(self): ) +class SparseEnhancedCPS_2024(EnhancedCPS): + input_dataset = ExtendedCPS_2024 + start_year = 2024 + end_year = 2024 + name = "sparse_enhanced_cps_2024" + label = "Sparse Enhanced CPS 2024" + file_path = STORAGE_FOLDER / "sparse_enhanced_cps_2024.h5" + url = "hf://policyengine/policyengine-us-data/sparse_enhanced_cps_2024.h5" + + def generate(self): + from policyengine_us import Microsimulation + + sim = Microsimulation(dataset=self.input_dataset) + data = sim.dataset.load_dataset() + data["household_weight"] = {} + original_weights = sim.calculate("household_weight") + original_weights = original_weights.values + np.random.normal( + 1, 0.1, len(original_weights) + ) + + # Run the optimization procedure to get (close to) minimum loss weights + for year in range(self.start_year, self.end_year + 1): + loss_matrix, targets_array = build_loss_matrix( + self.input_dataset, year + ) + + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_array_clean = targets_array[keep_idx] + assert 
loss_matrix_clean.shape[1] == targets_array_clean.size + + optimised_weights = reweight( + original_weights, + loss_matrix_clean, + targets_array_clean, + log_path="calibration_log.csv", + epochs=150, + sparse=True, + ) + data["household_weight"][year] = optimised_weights + + self.save_dataset(data) + + class EnhancedCPS_2024(EnhancedCPS): input_dataset = ExtendedCPS_2024 start_year = 2024 @@ -369,5 +506,6 @@ class EnhancedCPS_2024(EnhancedCPS): if __name__ == "__main__": - EnhancedCPS_2024().generate() - MinimizedEnhancedCPS_2024().generate() + # EnhancedCPS_2024().generate() + # MinimizedEnhancedCPS_2024().generate() + SparseEnhancedCPS_2024().generate() diff --git a/policyengine_us_data/datasets/cps/small_enhanced_cps.py b/policyengine_us_data/datasets/cps/small_enhanced_cps.py index 976725d9..9e8d697c 100644 --- a/policyengine_us_data/datasets/cps/small_enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/small_enhanced_cps.py @@ -1,5 +1,8 @@ +import pandas as pd import numpy as np +from policyengine_core.data.dataset import Dataset + def create_small_ecps(): from policyengine_us import Microsimulation @@ -37,6 +40,111 @@ def create_small_ecps(): grp.create_dataset(str(period), data=values) +def create_sparse_ecps(): + from policyengine_us import Microsimulation + from policyengine_us_data.datasets import SparseEnhancedCPS_2024 + from policyengine_us_data.storage import STORAGE_FOLDER + from policyengine_core.enums import Enum + + time_period = 2024 + + ecps = SparseEnhancedCPS_2024() + h5 = ecps.load() + sparse_weights = h5["household_sparse_weight"]["2024"][:] + hh_ids = h5["household_id"]["2024"][:] + + template_sim = Microsimulation( + dataset=EnhancedCPS_2024, + ) + template_sim.set_input("household_weight", 2024, sparse_weights) + + template_df = template_sim.to_input_dataframe() + + household_weight_column = f"household_weight__{time_period}" + df_household_id_column = f"household_id__{time_period}" + df_person_id_column = f"person_id__{time_period}" + + # Group by household ID and get the first entry for each group + df = template_df + h_df = df.groupby(df_household_id_column).first() + h_ids = pd.Series(h_df.index) + h_weights = pd.Series(h_df[household_weight_column].values) + + # Seed the random number generators for reproducibility + h_ids = h_ids[h_weights > 0] + h_weights = h_weights[h_weights > 0] + + subset_df = df[df[df_household_id_column].isin(h_ids)].copy() + + household_id_to_count = {} + for household_id in h_ids: + if household_id not in household_id_to_count: + household_id_to_count[household_id] = 0 + household_id_to_count[household_id] += 1 + + household_counts = subset_df[df_household_id_column].map( + lambda x: household_id_to_count.get(x, 0) + ) + + # NOTE: from subsample. I don't think I want to do this! + ## Adjust household weights to maintain the total weight + # for col in subset_df.columns: + # if "weight__" in col: + # target_total_weight = df[col].values.sum() + # if not quantize_weights: + # subset_df[col] *= household_counts.values + # else: + # subset_df[col] = household_counts.values + # subset_df[col] *= ( + # target_total_weight / subset_df[col].values.sum() + # ) + + df = subset_df + + # Update the dataset and rebuild the simulation + sim = Microsimulation() + sim.dataset = Dataset.from_dataframe(df, sim.dataset.time_period) + sim.build_from_dataset() + + # Ensure the baseline branch has the new data. 
+ if "baseline" in sim.branches: + baseline_tax_benefit_system = sim.branches[ + "baseline" + ].tax_benefit_system + sim.branches["baseline"] = sim.clone() + sim.branches["tax_benefit_system"] = baseline_tax_benefit_system + + sim.default_calculation_period = time_period + + # Get ready to write it out + simulation = sim + data = {} + for variable in simulation.tax_benefit_system.variables: + data[variable] = {} + for time_period in simulation.get_holder(variable).get_known_periods(): + values = simulation.get_holder(variable).get_array(time_period) + values = np.array(values) + if simulation.tax_benefit_system.variables.get( + variable + ).value_type in (Enum, str): + values = values.astype("S") + if values is not None: + data[variable][time_period] = values + + if len(data[variable]) == 0: + del data[variable] + + import h5py + + with h5py.File(STORAGE_FOLDER / "sparse_enhanced_cps_2024.h5", "w") as f: + for variable, periods in data.items(): + grp = f.create_group(variable) + for period, values in periods.items(): + grp.create_dataset(str(period), data=values) + + if __name__ == "__main__": create_small_ecps() print("Small CPS dataset created successfully.") + create_sparse_ecps() + print("Sparse CPS dataset created successfully.") diff --git a/policyengine_us_data/tests/test_datasets/test_sparse_enhanced_cps.py b/policyengine_us_data/tests/test_datasets/test_sparse_enhanced_cps.py new file mode 100644 index 00000000..b807c1ef --- /dev/null +++ b/policyengine_us_data/tests/test_datasets/test_sparse_enhanced_cps.py @@ -0,0 +1,85 @@ +import pytest + +import numpy as np + +from policyengine_us_data.utils import build_loss_matrix + + +def test_sparse_ecps(): + from policyengine_core.data import Dataset + from policyengine_us_data.storage import STORAGE_FOLDER + from policyengine_us import Microsimulation + + # NOTE: replace with "small_enhanced_cps_2024.h5 to see the difference! 
+ sim = Microsimulation( + dataset=Dataset.from_file( + STORAGE_FOLDER / f"sparse_enhanced_cps_2024.h5", + ) + ) + + data = sim.dataset.load_dataset() + bad_targets = [ + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "state/RI/adjusted_gross_income/amount/-inf_1", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "state/RI/adjusted_gross_income/amount/-inf_1", + "nation/irs/exempt interest/count/AGI in -inf-inf/taxable/All", + ] + + year = 2024 + loss_matrix, targets_array = build_loss_matrix(sim.dataset, year) + zero_mask = np.isclose(targets_array, 0.0, atol=0.1) + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~(zero_mask | bad_mask) + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_array_clean = targets_array[keep_idx] + assert loss_matrix_clean.shape[1] == targets_array_clean.size + + optimised_weights = data["household_weight"]["2024"] + print("\n\n---Sparse Solutions: reweighting quick diagnostics----\n") + print( + f"{np.sum(optimised_weights == 0)} are zero, {np.sum(optimised_weights != 0)} weights are nonzero" + ) + estimate = optimised_weights @ loss_matrix_clean + rel_error = ( + ((estimate - targets_array_clean) + 1) / (targets_array_clean + 1) + ) ** 2 + within_10_percent_mask = np.abs(estimate - targets_array_clean) <= ( + 0.10 * np.abs(targets_array_clean) + ) + percent_within_10 = np.mean(within_10_percent_mask) * 100 + print( + f"rel_error: min: {np.min(rel_error):.2f}\n" + f"max: {np.max(rel_error):.2f}\n" + f"mean: {np.mean(rel_error):.2f}\n" + f"median: {np.median(rel_error):.2f}\n" + f"Within 10% of target: {percent_within_10:.2f}%" + ) + print("Relative error over 100% for:") + for i in np.where(rel_error > 1)[0]: + print(f"target_name: {loss_matrix_clean.columns[i]}") + print(f"target_value: {targets_array_clean[i]}") + print(f"estimate_value: {estimate[i]}") + print(f"has rel_error: {rel_error[i]:.2f}\n") + print("---End of reweighting quick diagnostics------") + + assert percent_within_10 > 70.0 + + +if __name__ == "__main__": + test_sparse_ecps() diff --git a/policyengine_us_data/utils/__init__.py b/policyengine_us_data/utils/__init__.py index d25c6c2f..136d2503 100644 ---
a/policyengine_us_data/utils/__init__.py +++ b/policyengine_us_data/utils/__init__.py @@ -3,3 +3,4 @@ from .uprating import * from .loss import * from .qrf import * +from .l0 import * diff --git a/policyengine_us_data/utils/l0.py b/policyengine_us_data/utils/l0.py new file mode 100644 index 00000000..ebd89d0a --- /dev/null +++ b/policyengine_us_data/utils/l0.py @@ -0,0 +1,208 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import math + + +class HardConcrete(nn.Module): + """HardConcrete distribution for L0 regularization.""" + + def __init__( + self, + input_dim, + output_dim=None, + temperature=0.5, + stretch=0.1, + init_mean=0.5, + ): + super().__init__() + if output_dim is None: + self.gate_size = (input_dim,) + else: + self.gate_size = (input_dim, output_dim) + self.qz_logits = nn.Parameter(torch.zeros(self.gate_size)) + self.temperature = temperature + self.stretch = stretch + self.gamma = -0.1 + self.zeta = 1.1 + self.init_mean = init_mean + self.reset_parameters() + + def reset_parameters(self): + if self.init_mean is not None: + init_val = math.log(self.init_mean / (1 - self.init_mean)) + self.qz_logits.data.fill_(init_val) + + def forward(self, input_shape=None): + if self.training: + gates = self._sample_gates() + else: + gates = self._deterministic_gates() + if input_shape is not None and len(input_shape) > len(gates.shape): + gates = gates.unsqueeze(-1).unsqueeze(-1) + return gates + + def _sample_gates(self): + u = torch.zeros_like(self.qz_logits).uniform_(1e-8, 1.0 - 1e-8) + s = torch.log(u) - torch.log(1 - u) + self.qz_logits + s = torch.sigmoid(s / self.temperature) + s = s * (self.zeta - self.gamma) + self.gamma + gates = torch.clamp(s, 0, 1) + return gates + + def _deterministic_gates(self): + probs = torch.sigmoid(self.qz_logits) + gates = probs * (self.zeta - self.gamma) + self.gamma + return torch.clamp(gates, 0, 1) + + def get_penalty(self): + logits_shifted = self.qz_logits - self.temperature * math.log( + -self.gamma / self.zeta + ) + prob_active = torch.sigmoid(logits_shifted) + return prob_active.sum() + + def get_active_prob(self): + logits_shifted = self.qz_logits - self.temperature * math.log( + -self.gamma / self.zeta + ) + return torch.sigmoid(logits_shifted) + + +class L0Linear(nn.Module): + """Linear layer with L0 regularization using HardConcrete gates.""" + + def __init__( + self, + in_features, + out_features, + bias=True, + temperature=0.5, + init_sparsity=0.5, + ): + super().__init__() + self.in_features = in_features + self.out_features = out_features + self.weight = nn.Parameter(torch.Tensor(out_features, in_features)) + if bias: + self.bias = nn.Parameter(torch.Tensor(out_features)) + else: + self.register_parameter("bias", None) + self.weight_gates = HardConcrete( + out_features, + in_features, + temperature=temperature, + init_mean=init_sparsity, + ) + self.reset_parameters() + + def reset_parameters(self): + nn.init.kaiming_normal_(self.weight, mode="fan_out") + if self.bias is not None: + nn.init.zeros_(self.bias) + + def forward(self, input): + gates = self.weight_gates() + masked_weight = self.weight * gates + return F.linear(input, masked_weight, self.bias) + + def get_l0_penalty(self): + return self.weight_gates.get_penalty() + + def get_sparsity(self): + with torch.no_grad(): + prob_active = self.weight_gates.get_active_prob() + return 1.0 - prob_active.mean().item() + + +class SparseMLP(nn.Module): + """Example MLP with L0 regularization on all layers""" + + def __init__( + self, + input_dim=784, + 
hidden_dim=256, + output_dim=10, + init_sparsity=0.5, + temperature=0.5, + ): + super().__init__() + self.fc1 = L0Linear( + input_dim, + hidden_dim, + init_sparsity=init_sparsity, + temperature=temperature, + ) + self.fc2 = L0Linear( + hidden_dim, + hidden_dim, + init_sparsity=init_sparsity, + temperature=temperature, + ) + self.fc3 = L0Linear( + hidden_dim, + output_dim, + init_sparsity=init_sparsity, + temperature=temperature, + ) + + def forward(self, x): + x = x.view(x.size(0), -1) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + def get_l0_loss(self): + l0_loss = 0 + for module in self.modules(): + if isinstance(module, L0Linear): + l0_loss += module.get_l0_penalty() + return l0_loss + + def get_sparsity_stats(self): + stats = {} + for name, module in self.named_modules(): + if isinstance(module, L0Linear): + stats[name] = { + "sparsity": module.get_sparsity(), + "active_params": module.get_l0_penalty().item(), + } + return stats + + +def train_with_l0(model, train_loader, epochs=10, l0_lambda=1e-3): + optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) + criterion = nn.CrossEntropyLoss() + for epoch in range(epochs): + total_loss = 0 + total_l0 = 0 + for batch_idx, (data, target) in enumerate(train_loader): + optimizer.zero_grad() + output = model(data) + ce_loss = criterion(output, target) + l0_loss = model.get_l0_loss() + loss = ce_loss + l0_lambda * l0_loss + loss.backward() + optimizer.step() + total_loss += ce_loss.item() + total_l0 += l0_loss.item() + if epoch % 1 == 0: + sparsity_stats = model.get_sparsity_stats() + print( + f"Epoch {epoch}: Loss={total_loss/len(train_loader):.4f}, L0={total_l0/len(train_loader):.4f}" + ) + for layer, stats in sparsity_stats.items(): + print( + f" {layer}: {stats['sparsity']*100:.1f}% sparse, {stats['active_params']:.1f} active params" + ) + + +def prune_model(model, threshold=0.05): + for module in model.modules(): + if isinstance(module, L0Linear): + with torch.no_grad(): + prob_active = module.weight_gates.get_active_prob() + mask = (prob_active > threshold).float() + module.weight.data *= mask + return model From 5ae89d5a515ca08113a8202928011008fcb32871 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 16:57:51 +0200 Subject: [PATCH 53/56] update datasets to be generated --- policyengine_us_data/datasets/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/policyengine_us_data/datasets/__init__.py b/policyengine_us_data/datasets/__init__.py index 87461837..c0f2c8fd 100644 --- a/policyengine_us_data/datasets/__init__.py +++ b/policyengine_us_data/datasets/__init__.py @@ -14,6 +14,8 @@ CensusCPS_2023, EnhancedCPS_2024, ReweightedCPS_2024, + MinimizedEnhancedCPS_2024, + SparseEnhancedCPS_2024, ) from .puf import PUF_2015, PUF_2021, PUF_2024, IRS_PUF_2015 from .acs import ACS_2022 From 0521be64c5faf3736e24dfb8e4f29109d8bfb1d4 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 17:17:32 +0200 Subject: [PATCH 54/56] try adding logic to generate the sparse ecps if missing --- .../datasets/cps/enhanced_cps.py | 4 ++++ .../datasets/cps/small_enhanced_cps.py | 17 ++++++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 38e9fad0..d246a89d 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -491,6 +491,10 @@ def generate(self): sparse=True, ) data["household_weight"][year] 
= optimised_weights + # Also save as sparse weights for small_enhanced_cps.py + if "household_sparse_weight" not in data: + data["household_sparse_weight"] = {} + data["household_sparse_weight"][year] = optimised_weights self.save_dataset(data) diff --git a/policyengine_us_data/datasets/cps/small_enhanced_cps.py b/policyengine_us_data/datasets/cps/small_enhanced_cps.py index 9e8d697c..db13b770 100644 --- a/policyengine_us_data/datasets/cps/small_enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/small_enhanced_cps.py @@ -49,9 +49,20 @@ def create_sparse_ecps(): time_period = 2024 ecps = SparseEnhancedCPS_2024() - h5 = ecps.load() - sparse_weights = h5["household_sparse_weight"]["2024"][:] - hh_ids = h5["household_id"]["2024"][:] + + # Check if sparse weights exist, if not generate them + try: + h5 = ecps.load() + sparse_weights = h5["household_sparse_weight"]["2024"][:] + hh_ids = h5["household_id"]["2024"][:] + except KeyError: + print( + "Sparse weights not found. Generating SparseEnhancedCPS_2024 dataset..." + ) + ecps.generate() + h5 = ecps.load() + sparse_weights = h5["household_sparse_weight"]["2024"][:] + hh_ids = h5["household_id"]["2024"][:] template_sim = Microsimulation( dataset=EnhancedCPS_2024, From 1d38077b0ee938709647acf4626dc6b24ed72a12 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 17:35:44 +0200 Subject: [PATCH 55/56] make saving minimized ECPS optional --- .github/workflows/code_changes.yaml | 1 + .github/workflows/pr_code_changes.yaml | 1 + policyengine_us_data/datasets/cps/enhanced_cps.py | 7 ++++++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/code_changes.yaml b/.github/workflows/code_changes.yaml index 908dd887..6c619b40 100644 --- a/.github/workflows/code_changes.yaml +++ b/.github/workflows/code_changes.yaml @@ -61,6 +61,7 @@ jobs: with: name: minimized_enhanced_cps_2024_calibration_log.csv path: minimized_enhanced_cps_2024_calibration_log.csv + if-no-files-found: ignore - name: Run tests run: pytest - name: Upload data diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 678d7d0d..4c2d6cbf 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -88,6 +88,7 @@ jobs: with: name: minimized_enhanced_cps_2024_calibration_log.csv path: minimized_enhanced_cps_2024_calibration_log.csv + if-no-files-found: ignore - name: Run tests run: pytest diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index d246a89d..984308bc 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -460,6 +460,9 @@ class SparseEnhancedCPS_2024(EnhancedCPS): def generate(self): from policyengine_us import Microsimulation + from policyengine_us_data.utils.minimize import ( + create_calibration_log_file, + ) sim = Microsimulation(dataset=self.input_dataset) data = sim.dataset.load_dataset() @@ -498,6 +501,8 @@ def generate(self): self.save_dataset(data) + create_calibration_log_file(self.file_path) + class EnhancedCPS_2024(EnhancedCPS): input_dataset = ExtendedCPS_2024 @@ -510,6 +515,6 @@ class EnhancedCPS_2024(EnhancedCPS): if __name__ == "__main__": - # EnhancedCPS_2024().generate() + EnhancedCPS_2024().generate() # MinimizedEnhancedCPS_2024().generate() SparseEnhancedCPS_2024().generate() From 6bb0fb17dcd8bb017810f236c1d82a9e54ddaf86 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 18:10:25 +0200 Subject: 
[PATCH 56/56] reducing iterations hoping jobs don't get killed --- policyengine_us_data/utils/minimize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_us_data/utils/minimize.py b/policyengine_us_data/utils/minimize.py index 8575470a..ce2c6fdf 100644 --- a/policyengine_us_data/utils/minimize.py +++ b/policyengine_us_data/utils/minimize.py @@ -167,7 +167,7 @@ def candidate_loss_contribution( targets: np.ndarray, normalisation_factor: np.ndarray, loss_rel_change_max: float, - count_iterations: int = 10, + count_iterations: int = 5, view_fraction_per_iteration: float = 0.5, fraction_remove_per_iteration: float = 0.05, ) -> np.ndarray:
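(For orientation on the sparse reweighting added in PATCH 52: HardConcrete attaches a learnable gate in [0, 1] to each household weight; gates are sampled stochastically during training, become deterministic in eval mode, and get_penalty() approximates the expected number of open gates, which reweight(..., sparse=True) adds to the calibration loss scaled by l0_lambda. Below is a minimal usage sketch of the class created in policyengine_us_data/utils/l0.py; the toy weights, seed, and sizes are illustrative and not part of the patches.)

import torch
from policyengine_us_data.utils.l0 import HardConcrete

torch.manual_seed(0)
base_weights = torch.rand(1_000) * 100  # stand-in household weights, not real data
gates = HardConcrete(1_000, init_mean=0.999, temperature=0.5)

# Training mode (default): gates are sampled stochastically, so masked weights differ per draw.
masked = base_weights * gates()
l0_lambda = 1e-5
penalty = l0_lambda * gates.get_penalty()  # get_penalty() ~ expected count of open gates
print("L0 penalty term:", float(penalty))

# Evaluation mode: gates become deterministic; fully closed gates zero out their weights.
gates.eval()
final_weights = (base_weights * gates()).detach()
print(int((final_weights == 0).sum()), "weights are exactly zero before any training")

In the actual SparseEnhancedCPS_2024 pipeline the gates are optimized jointly with the log-weights, so gates driven fully closed yield household weights that are exactly zero while the remaining weights stay calibrated to the targets.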