From 4c0f1c611e85436fe5b1c0e1c87deb386846d761 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Thu, 10 Jul 2025 21:20:39 +0100 Subject: [PATCH 01/58] Shrink datasets --- policyengine_us_data/utils/minimise.py | 85 ++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 policyengine_us_data/utils/minimise.py diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py new file mode 100644 index 00000000..4355e889 --- /dev/null +++ b/policyengine_us_data/utils/minimise.py @@ -0,0 +1,85 @@ +from policyengine_us_data.utils.loss import build_loss_matrix +from policyengine_core.data import Dataset +from policyengine_us import Microsimulation +import numpy as np +import pandas as pd + +def minimise_dataset(dataset, output_path: str, loss_rel_change_max: float) -> None: + # if loading from a .h5 file, need to do dataset = Dataset.from_file(dataset) + loss_matrix = build_loss_matrix(dataset, 2024) + + sim = Microsimulation(dataset=dataset) + + weights = sim.calculate("household_weight", 2024).values + estimate_matrix, targets = loss_matrix + is_national = estimate_matrix.columns.str.startswith("nation/") + nation_normalisation_factor = is_national * (1 / is_national.sum()) + state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) + normalisation_factor = np.where( + is_national, nation_normalisation_factor, state_normalisation_factor + ) + weights @ estimate_matrix + + def get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_factor): + """ + Calculate the loss based on the inclusion mask and the estimate matrix. + """ + masked_weights = weights.copy() + original_weight_total = masked_weights.sum() + masked_weights[~inclusion_mask] = 0 + masked_weight_total = masked_weights.sum() + masked_weights[inclusion_mask] *= original_weight_total / masked_weight_total + estimates = masked_weights @ estimate_matrix + rel_error = ((estimates - targets) + 1) / (targets + 1) + loss = ((rel_error * normalisation_factor) ** 2).mean() + + return loss + + COUNT_ITERATIONS = 5 + FRACTION_REMOVE_PER_ITERATION = 0.1 + from tqdm import tqdm + + full_mask = np.ones_like(weights, dtype=bool) + for i in range(COUNT_ITERATIONS): + inclusion_mask = full_mask.copy() + baseline_loss = get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_factor) + household_loss_rel_changes = [] + for household_index in tqdm(range(len(weights))): + # Skip if this household is already excluded + if not inclusion_mask[household_index]: + household_loss_rel_changes.append(np.inf) + continue + # Calculate loss if this household is removed + inclusion_mask = inclusion_mask.copy() + inclusion_mask[household_index] = False + loss = get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_factor) + rel_change = (loss - baseline_loss) / baseline_loss + household_loss_rel_changes.append(rel_change) + inclusion_mask = full_mask.copy() + household_loss_rel_changes = np.array(household_loss_rel_changes) + # Sort by the relative change in loss + sorted_indices = np.argsort(household_loss_rel_changes) + # Remove the worst households + num_to_remove = int(len(weights) * FRACTION_REMOVE_PER_ITERATION) + worst_indices = sorted_indices[:num_to_remove] + inclusion_mask[worst_indices] = False + # Calculate the new loss + new_loss = get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_factor) + print(f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}") + print(f"Removed {num_to_remove} 
households with worst relative loss changes.") + # Update the full mask + full_mask &= inclusion_mask + + household_ids = sim.calculate("household_id", 2024).values + remaining_households = household_ids[full_mask] + + # At this point we have a mask of households to keep + + # I'm saving to a csv for ease of debugging, but we need to save to a .h5 file + + df = sim.to_input_dataframe() + df = df[df["household_id__2024"].isin(remaining_households)] + + df.to_csv(output_path, index=False) + + return df \ No newline at end of file From 6b2a56f6f8a55aacb4ee9e305bd53c74f36c70b0 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Thu, 10 Jul 2025 22:25:27 +0100 Subject: [PATCH 02/58] Move to package --- Makefile | 1 + .../storage/upload_completed_datasets.py | 1 + policyengine_us_data/utils/minimise.py | 127 +++++++++++++++--- 3 files changed, 114 insertions(+), 15 deletions(-) diff --git a/Makefile b/Makefile index 788ba1d3..90b2817a 100644 --- a/Makefile +++ b/Makefile @@ -46,6 +46,7 @@ data: python policyengine_us_data/datasets/cps/extended_cps.py python policyengine_us_data/datasets/cps/enhanced_cps.py python policyengine_us_data/datasets/cps/small_enhanced_cps.py + python policyengine_us_data/utils/minimise.py clean: rm -f policyengine_us_data/storage/*.h5 diff --git a/policyengine_us_data/storage/upload_completed_datasets.py b/policyengine_us_data/storage/upload_completed_datasets.py index f161a9ee..16885d8c 100644 --- a/policyengine_us_data/storage/upload_completed_datasets.py +++ b/policyengine_us_data/storage/upload_completed_datasets.py @@ -15,6 +15,7 @@ def upload_datasets(): Pooled_3_Year_CPS_2023.file_path, CPS_2023.file_path, STORAGE_FOLDER / "small_enhanced_cps_2024.h5", + STORAGE_FOLDER / "enhanced_cps_2024_minified.h5", ] for file_path in dataset_files: diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 4355e889..6fe511fd 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -3,9 +3,43 @@ from policyengine_us import Microsimulation import numpy as np import pandas as pd +import h5py +from policyengine_us_data.storage import STORAGE_FOLDER + + +def create_calibration_log_file(file_path): + dataset = Dataset.from_file(file_path) + + loss_matrix = build_loss_matrix(dataset, 2024) + + sim = Microsimulation(dataset=dataset) + + estimates = sim.calculate("household_weight", 2024).values @ loss_matrix[0] + target_names = loss_matrix[0].columns + target_values = loss_matrix[1] + + df = pd.DataFrame( + { + "target_name": target_names, + "estimate": estimates, + "target": target_values, + } + ) + df["epoch"] = 0 + df["error"] = df["estimate"] - df["target"] + df["rel_error"] = df["error"] / df["target"] + df["abs_error"] = df["error"].abs() + df["rel_abs_error"] = df["abs_error"] / df["target"].abs() + df["loss"] = (df["rel_error"] ** 2).mean() + + df.to_csv(file_path.replace(".h5", "_calibration_log.csv"), index=False) + + +def minimise_dataset( + dataset, output_path: str, loss_rel_change_max: float +) -> None: + create_calibration_log_file(dataset) -def minimise_dataset(dataset, output_path: str, loss_rel_change_max: float) -> None: - # if loading from a .h5 file, need to do dataset = Dataset.from_file(dataset) loss_matrix = build_loss_matrix(dataset, 2024) sim = Microsimulation(dataset=dataset) @@ -20,15 +54,20 @@ def minimise_dataset(dataset, output_path: str, loss_rel_change_max: float) -> N ) weights @ estimate_matrix - def get_loss_from_mask(inclusion_mask, estimate_matrix, targets, 
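A tiny illustration (not part of the patch) of the nation/state normalisation built above. The target names here are hypothetical; only the "nation/" prefix convention is taken from the code. Within each group the factors equal one over the group size, so they sum to 1 per group.

import numpy as np
import pandas as pd

# Hypothetical target columns; only the "nation/" prefix matters here.
columns = pd.Index(
    [
        "nation/income_tax",
        "nation/snap",
        "state/CA/medicaid",
        "state/TX/medicaid",
        "state/NY/medicaid",
    ]
)
is_national = columns.str.startswith("nation/")
normalisation_factor = np.where(
    is_national,
    1 / is_national.sum(),
    1 / (~is_national).sum(),
)
print(normalisation_factor)                      # [0.5 0.5 0.333... 0.333... 0.333...]
print(normalisation_factor[is_national].sum())   # 1.0
print(normalisation_factor[~is_national].sum())  # 1.0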
normalisation_factor): + def get_loss_from_mask( + inclusion_mask, estimate_matrix, targets, normalisation_factor + ): """ Calculate the loss based on the inclusion mask and the estimate matrix. """ masked_weights = weights.copy() original_weight_total = masked_weights.sum() - masked_weights[~inclusion_mask] = 0 + if (~inclusion_mask).sum() > 0: + masked_weights[~inclusion_mask] = 0 masked_weight_total = masked_weights.sum() - masked_weights[inclusion_mask] *= original_weight_total / masked_weight_total + masked_weights[inclusion_mask] *= ( + original_weight_total / masked_weight_total + ) estimates = masked_weights @ estimate_matrix rel_error = ((estimates - targets) + 1) / (targets + 1) loss = ((rel_error * normalisation_factor) ** 2).mean() @@ -36,15 +75,23 @@ def get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_f return loss COUNT_ITERATIONS = 5 + VIEW_FRACTION_PER_ITERATION = 0.3 FRACTION_REMOVE_PER_ITERATION = 0.1 from tqdm import tqdm full_mask = np.ones_like(weights, dtype=bool) for i in range(COUNT_ITERATIONS): inclusion_mask = full_mask.copy() - baseline_loss = get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_factor) + baseline_loss = get_loss_from_mask( + inclusion_mask, estimate_matrix, targets, normalisation_factor + ) household_loss_rel_changes = [] - for household_index in tqdm(range(len(weights))): + indices = np.random.choice( + np.arange(len(weights)), + size=int(len(weights) * VIEW_FRACTION_PER_ITERATION), + replace=False, + ) + for household_index in tqdm(indices): # Skip if this household is already excluded if not inclusion_mask[household_index]: household_loss_rel_changes.append(np.inf) @@ -52,7 +99,9 @@ def get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_f # Calculate loss if this household is removed inclusion_mask = inclusion_mask.copy() inclusion_mask[household_index] = False - loss = get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_factor) + loss = get_loss_from_mask( + inclusion_mask, estimate_matrix, targets, normalisation_factor + ) rel_change = (loss - baseline_loss) / baseline_loss household_loss_rel_changes.append(rel_change) inclusion_mask = full_mask.copy() @@ -64,12 +113,24 @@ def get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_f worst_indices = sorted_indices[:num_to_remove] inclusion_mask[worst_indices] = False # Calculate the new loss - new_loss = get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_factor) - print(f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}") - print(f"Removed {num_to_remove} households with worst relative loss changes.") + new_loss = get_loss_from_mask( + inclusion_mask, estimate_matrix, targets, normalisation_factor + ) + rel_change = (new_loss - baseline_loss) / baseline_loss + if rel_change > loss_rel_change_max: + print( + f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}, which is too high ({rel_change:.2%}). Stopping." + ) + break + print( + f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}" + ) + print( + f"Removed {num_to_remove} households with worst relative loss changes." 
+ ) # Update the full mask full_mask &= inclusion_mask - + household_ids = sim.calculate("household_id", 2024).values remaining_households = household_ids[full_mask] @@ -78,8 +139,44 @@ def get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_f # I'm saving to a csv for ease of debugging, but we need to save to a .h5 file df = sim.to_input_dataframe() - df = df[df["household_id__2024"].isin(remaining_households)] + smaller_df = df[df["household_id__2024"].isin(remaining_households)] + + weight_rel_change = ( + smaller_df["household_weight__2024"].sum() + / df["household_weight__2024"].sum() + ) + print(f"Weight relative change: {weight_rel_change:.2%}") + + sim = Microsimulation(dataset=smaller_df) + + sim.set_input( + "household_weight", + 2024, + sim.calculate("household_weight", 2024).values / weight_rel_change, + ) + + data = {} + + for variable in sim.input_variables: + data[variable] = {2024: sim.calculate(variable, 2024).values} + if data[variable][2024].dtype == "object": + data[variable][2024] = data[variable][2024].astype("S") + + with h5py.File(output_path, "w") as f: + for variable, values in data.items(): + for year, value in values.items(): + f.create_dataset(f"{variable}/{year}", data=value) + print(f"Saved minimised dataset to {output_path}") + + create_calibration_log_file(output_path) + - df.to_csv(output_path, index=False) +if __name__ == "__main__": + # Example usage + files = [ + STORAGE_FOLDER / "enhanced_cps_2024.h5", + ] - return df \ No newline at end of file + for file in files: + output_path = file.with_name(file.stem + "_minimised.h5") + minimise_dataset(file, output_path, loss_rel_change_max=10) From 05ee7e4075293057756d24da0e23b36a6cfe3465 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Thu, 10 Jul 2025 23:50:52 +0100 Subject: [PATCH 03/58] Try L0 --- Makefile | 1 - .../datasets/cps/enhanced_cps.py | 18 +++++++++++++++++- policyengine_us_data/utils/minimise.py | 4 +++- 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 90b2817a..788ba1d3 100644 --- a/Makefile +++ b/Makefile @@ -46,7 +46,6 @@ data: python policyengine_us_data/datasets/cps/extended_cps.py python policyengine_us_data/datasets/cps/enhanced_cps.py python policyengine_us_data/datasets/cps/small_enhanced_cps.py - python policyengine_us_data/utils/minimise.py clean: rm -f policyengine_us_data/storage/*.h5 diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index b8af12ce..9e61414c 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -59,9 +59,25 @@ def loss(weights): ((estimate - targets_array) + 1) / (targets_array + 1) ) ** 2 rel_error_normalized = rel_error * normalisation_factor + + # L0 penalty (approximated with smooth function) + # Since L0 is non-differentiable, we use a smooth approximation + # Common approaches: + + # Option 1: Sigmoid approximation + epsilon = 1e-3 # Threshold for "near zero" + l0_penalty_weight = 1e-1 # Adjust this hyperparameter + smoothed_l0 = torch.sigmoid((weights - epsilon) / (epsilon * 0.1)).mean() + + # Option 2: Log-sum penalty (smoother) + # smoothed_l0 = torch.log(1 + actual_weights / epsilon).sum() / len(actual_weights) + + # Option 3: Exponential penalty + # smoothed_l0 = (1 - torch.exp(-actual_weights / epsilon)).mean() + if torch.isnan(rel_error_normalized).any(): raise ValueError("Relative error contains NaNs") - return rel_error_normalized.mean() + return 
rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 def dropout_weights(weights, p): if p == 0: diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 6fe511fd..2b122fec 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -32,14 +32,16 @@ def create_calibration_log_file(file_path): df["rel_abs_error"] = df["abs_error"] / df["target"].abs() df["loss"] = (df["rel_error"] ** 2).mean() - df.to_csv(file_path.replace(".h5", "_calibration_log.csv"), index=False) + df.to_csv(str(file_path).replace(".h5", "_calibration_log.csv"), index=False) def minimise_dataset( dataset, output_path: str, loss_rel_change_max: float ) -> None: + dataset = str(dataset) create_calibration_log_file(dataset) + dataset = Dataset.from_file(dataset) loss_matrix = build_loss_matrix(dataset, 2024) sim = Microsimulation(dataset=dataset) From e38c6479483c9b2fb0cca9939c881995267a10d7 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Thu, 10 Jul 2025 23:54:02 +0100 Subject: [PATCH 04/58] Format --- policyengine_us_data/datasets/cps/enhanced_cps.py | 10 ++++++---- policyengine_us_data/utils/minimise.py | 4 +++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 9e61414c..7d81a0c0 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -63,15 +63,17 @@ def loss(weights): # L0 penalty (approximated with smooth function) # Since L0 is non-differentiable, we use a smooth approximation # Common approaches: - + # Option 1: Sigmoid approximation epsilon = 1e-3 # Threshold for "near zero" l0_penalty_weight = 1e-1 # Adjust this hyperparameter - smoothed_l0 = torch.sigmoid((weights - epsilon) / (epsilon * 0.1)).mean() - + smoothed_l0 = torch.sigmoid( + (weights - epsilon) / (epsilon * 0.1) + ).mean() + # Option 2: Log-sum penalty (smoother) # smoothed_l0 = torch.log(1 + actual_weights / epsilon).sum() / len(actual_weights) - + # Option 3: Exponential penalty # smoothed_l0 = (1 - torch.exp(-actual_weights / epsilon)).mean() diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 2b122fec..186a7673 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -32,7 +32,9 @@ def create_calibration_log_file(file_path): df["rel_abs_error"] = df["abs_error"] / df["target"].abs() df["loss"] = (df["rel_error"] ** 2).mean() - df.to_csv(str(file_path).replace(".h5", "_calibration_log.csv"), index=False) + df.to_csv( + str(file_path).replace(".h5", "_calibration_log.csv"), index=False + ) def minimise_dataset( From bdf3d6d89d16ac396786899ce3e3233c0c46ceb4 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Fri, 11 Jul 2025 16:24:22 +0200 Subject: [PATCH 05/58] attempting to vectorize minimizing of ecps --- changelog_entry.yaml | 4 + .../datasets/cps/enhanced_cps.py | 27 +++--- policyengine_us_data/utils/minimise.py | 83 ++++++++++++++++--- 3 files changed, 91 insertions(+), 23 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..84eeb584 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: minor + changes: + added: + - Enhanced CPS minimizing tests. 
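As a standalone sketch (not from any patch above), the three smooth L0 surrogates can be compared on a toy weight vector. epsilon matches the "near zero" threshold chosen in enhanced_cps.py; the weight values and the non-differentiable reference are illustrative only.

import torch

# Toy weights spanning "effectively zero" to clearly non-zero.
weights = torch.tensor([0.0, 1e-4, 1e-3, 0.5, 2.0, 10.0])
epsilon = 1e-3  # same threshold as in the loss above

# Option 1: sigmoid approximation (penalty_approach == "l0_sigmoid")
sigmoid_l0 = torch.sigmoid((weights - epsilon) / (epsilon * 0.1)).mean()
# Option 2: log-sum surrogate
log_l0 = torch.log(1 + weights / epsilon).sum() / len(weights)
# Option 3: exponential surrogate
exp_l0 = (1 - torch.exp(-weights / epsilon)).mean()
# Non-differentiable reference: fraction of weights above the threshold
exact_l0 = (weights > epsilon).float().mean()

print(
    f"sigmoid={sigmoid_l0.item():.3f} "
    f"log={log_l0.item():.3f} "
    f"exp={exp_l0.item():.3f} "
    f"exact={exact_l0.item():.3f}"
)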
\ No newline at end of file diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 7d81a0c0..bf303f7a 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -45,8 +45,8 @@ def reweight( np.log(original_weights), requires_grad=True, dtype=torch.float32 ) - # TODO: replace this with a call to the python reweight.py package. - def loss(weights): + # TO DO: replace this with a call to the python reweight.py package. + def loss(weights, penalty_approach="l0_sigmoid"): # Check for Nans in either the weights or the loss matrix if torch.isnan(weights).any(): raise ValueError("Weights contain NaNs") @@ -60,25 +60,32 @@ def loss(weights): ) ** 2 rel_error_normalized = rel_error * normalisation_factor + if torch.isnan(rel_error_normalized).any(): + raise ValueError("Relative error contains NaNs") + # L0 penalty (approximated with smooth function) # Since L0 is non-differentiable, we use a smooth approximation # Common approaches: - # Option 1: Sigmoid approximation epsilon = 1e-3 # Threshold for "near zero" l0_penalty_weight = 1e-1 # Adjust this hyperparameter - smoothed_l0 = torch.sigmoid( - (weights - epsilon) / (epsilon * 0.1) - ).mean() + + # Option 1: Sigmoid approximation + if penalty_approach == "l0_sigmoid": + smoothed_l0 = torch.sigmoid( + (weights - epsilon) / (epsilon * 0.1) + ).mean() # Option 2: Log-sum penalty (smoother) - # smoothed_l0 = torch.log(1 + actual_weights / epsilon).sum() / len(actual_weights) + if penalty_approach == "l0_log": + smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len(weights) # Option 3: Exponential penalty - # smoothed_l0 = (1 - torch.exp(-actual_weights / epsilon)).mean() + if penalty_approach == "l0_exp": + smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() + + # L1 penalty - if torch.isnan(rel_error_normalized).any(): - raise ValueError("Relative error contains NaNs") return rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 def dropout_weights(weights, p): diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 186a7673..94601d02 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -5,6 +5,7 @@ import pandas as pd import h5py from policyengine_us_data.storage import STORAGE_FOLDER +from typing import Optional def create_calibration_log_file(file_path): @@ -37,6 +38,57 @@ def create_calibration_log_file(file_path): ) +def losses_for_candidates( + base_weights: np.ndarray, + idxs: np.ndarray, + est_mat: np.ndarray, + targets: np.ndarray, + norm: np.ndarray, + chunk_size: Optional[int] = 25_000, +) -> np.ndarray: + """ + Return the loss value *for each* candidate deletion in `idxs` + in one matrix multiplication. 
+ + Parameters + ---------- + base_weights : (n,) original weight vector + idxs : (k,) candidate row indices to zero-out + est_mat : (n, m) estimate matrix + targets : (m,) calibration targets + norm : (m,) normalisation factors + chunk_size : max number of candidates to process at once + + Returns + ------- + losses : (k,) loss if row i were removed (and weights rescaled) + """ + W = base_weights + total = W.sum() + k = len(idxs) + losses = np.empty(k, dtype=float) + + # Work through the candidate list in blocks + for start in range(0, k, chunk_size): + stop = min(start + chunk_size, k) + part = idxs[start:stop] # (p,) where p ≤ chunk_size + p = len(part) + + # Build the delta matrix only for this chunk + delta = np.zeros((p, len(W))) + delta[np.arange(p), part] = -W[part] + + keep_total = total + delta.sum(axis=1) # (p,) + delta *= (total / keep_total)[:, None] + + # Matrix–matrix multiply → one matrix multiplication per chunk + ests = (W + delta) @ est_mat # (p, m) + rel_err = ((ests - targets) + 1) / (targets + 1) + losses[start:stop] = ((rel_err * norm) ** 2).mean(axis=1) + + return losses + + def minimise_dataset( dataset, output_path: str, loss_rel_change_max: float ) -> None: @@ -95,19 +147,24 @@ def get_loss_from_mask( size=int(len(weights) * VIEW_FRACTION_PER_ITERATION), replace=False, ) - for household_index in tqdm(indices): - # Skip if this household is already excluded - if not inclusion_mask[household_index]: - household_loss_rel_changes.append(np.inf) - continue - # Calculate loss if this household is removed - inclusion_mask = inclusion_mask.copy() - inclusion_mask[household_index] = False - loss = get_loss_from_mask( - inclusion_mask, estimate_matrix, targets, normalisation_factor - ) - rel_change = (loss - baseline_loss) / baseline_loss - household_loss_rel_changes.append(rel_change) + + # more efficient approach to compute losses for candidate households to be removed + + # 1. sample only households that are currently *included* + indices = np.random.choice( + np.where(full_mask)[0], + size=int(full_mask.sum() * VIEW_FRACTION_PER_ITERATION), + replace=False, + ) + # 2. compute losses for the batch in one shot + candidate_losses = losses_for_candidates( + weights, indices, estimate_matrix, targets, normalisation_factor + ) + # 3. convert to relative change vs. 
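A synthetic-data sketch (assuming the patched policyengine_us_data.utils.minimise module is importable): chunk_size only controls how many candidate deletions are batched into each matrix multiply, so the returned losses should not depend on it.

import numpy as np
from policyengine_us_data.utils.minimise import losses_for_candidates

rng = np.random.default_rng(0)
n, m = 500, 40
weights = rng.uniform(0.5, 2.0, n)
est_mat = rng.uniform(0.0, 1.0, (n, m))
targets = weights @ est_mat * rng.uniform(0.9, 1.1, m)
norm = np.full(m, 1.0 / m)

idxs = rng.choice(n, size=50, replace=False)
losses_small_chunks = losses_for_candidates(
    weights, idxs, est_mat, targets, norm, chunk_size=7
)
losses_one_chunk = losses_for_candidates(
    weights, idxs, est_mat, targets, norm, chunk_size=len(idxs)
)
assert np.allclose(losses_small_chunks, losses_one_chunk)
print(losses_small_chunks[:5])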
baseline + household_loss_rel_changes = ( + candidate_losses - baseline_loss + ) / baseline_loss + inclusion_mask = full_mask.copy() household_loss_rel_changes = np.array(household_loss_rel_changes) # Sort by the relative change in loss From 03e5d0d380494b698cbcb4af14b5c8eb256754d0 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Fri, 11 Jul 2025 16:52:43 +0200 Subject: [PATCH 06/58] adding random sampling minimization strategy --- policyengine_us_data/utils/minimise.py | 240 ++++++++++++++++++------- 1 file changed, 173 insertions(+), 67 deletions(-) diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 94601d02..45212905 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -5,7 +5,7 @@ import pandas as pd import h5py from policyengine_us_data.storage import STORAGE_FOLDER -from typing import Optional +from typing import Optional, Callable def create_calibration_log_file(file_path): @@ -89,116 +89,214 @@ def losses_for_candidates( return losses -def minimise_dataset( - dataset, output_path: str, loss_rel_change_max: float -) -> None: - dataset = str(dataset) - create_calibration_log_file(dataset) +def get_loss_from_mask( + weights, inclusion_mask, estimate_matrix, targets, normalisation_factor +): + """ + Calculate the loss based on the inclusion mask and the estimate matrix. + """ + masked_weights = weights.copy() + original_weight_total = masked_weights.sum() + if (~inclusion_mask).sum() > 0: + masked_weights[~inclusion_mask] = 0 + masked_weight_total = masked_weights.sum() + masked_weights[inclusion_mask] *= ( + original_weight_total / masked_weight_total + ) + estimates = masked_weights @ estimate_matrix + rel_error = ((estimates - targets) + 1) / (targets + 1) + loss = ((rel_error * normalisation_factor) ** 2).mean() - dataset = Dataset.from_file(dataset) - loss_matrix = build_loss_matrix(dataset, 2024) + return loss - sim = Microsimulation(dataset=dataset) - weights = sim.calculate("household_weight", 2024).values - estimate_matrix, targets = loss_matrix - is_national = estimate_matrix.columns.str.startswith("nation/") - nation_normalisation_factor = is_national * (1 / is_national.sum()) - state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) - normalisation_factor = np.where( - is_national, nation_normalisation_factor, state_normalisation_factor - ) - weights @ estimate_matrix - - def get_loss_from_mask( - inclusion_mask, estimate_matrix, targets, normalisation_factor - ): - """ - Calculate the loss based on the inclusion mask and the estimate matrix. - """ - masked_weights = weights.copy() - original_weight_total = masked_weights.sum() - if (~inclusion_mask).sum() > 0: - masked_weights[~inclusion_mask] = 0 - masked_weight_total = masked_weights.sum() - masked_weights[inclusion_mask] *= ( - original_weight_total / masked_weight_total - ) - estimates = masked_weights @ estimate_matrix - rel_error = ((estimates - targets) + 1) / (targets + 1) - loss = ((rel_error * normalisation_factor) ** 2).mean() +def candidate_loss_contribution( + weights: np.ndarray, + estimate_matrix: np.ndarray, + targets: np.ndarray, + normalisation_factor: np.ndarray, + loss_rel_change_max: float, + count_iterations: int = 5, + view_fraction_per_iteration: float = 0.3, + fraction_remove_per_iteration: float = 0.1, +) -> np.ndarray: + """ + Minimization approach based on candidate loss contribution. 
+ + This function iteratively removes households that contribute least to the loss, + maintaining the calibration quality within the specified tolerance. - return loss + Parameters + ---------- + weights : (n,) household weights + estimate_matrix : (n, m) matrix mapping weights to estimates + targets : (m,) calibration targets + normalisation_factor : (m,) normalisation factors for different targets + loss_rel_change_max : maximum allowed relative change in loss + count_iterations : number of iterations to perform + view_fraction_per_iteration : fraction of households to evaluate each iteration + fraction_remove_per_iteration : fraction of households to remove each iteration - COUNT_ITERATIONS = 5 - VIEW_FRACTION_PER_ITERATION = 0.3 - FRACTION_REMOVE_PER_ITERATION = 0.1 + Returns + ------- + inclusion_mask : (n,) boolean mask of households to keep + """ from tqdm import tqdm full_mask = np.ones_like(weights, dtype=bool) - for i in range(COUNT_ITERATIONS): + + for i in range(count_iterations): inclusion_mask = full_mask.copy() baseline_loss = get_loss_from_mask( - inclusion_mask, estimate_matrix, targets, normalisation_factor + weights, + inclusion_mask, + estimate_matrix, + targets, + normalisation_factor, ) - household_loss_rel_changes = [] - indices = np.random.choice( - np.arange(len(weights)), - size=int(len(weights) * VIEW_FRACTION_PER_ITERATION), - replace=False, - ) - - # more efficient approach to compute losses for candidate households to be removed - # 1. sample only households that are currently *included* + # Sample only households that are currently included indices = np.random.choice( np.where(full_mask)[0], - size=int(full_mask.sum() * VIEW_FRACTION_PER_ITERATION), + size=int(full_mask.sum() * view_fraction_per_iteration), replace=False, ) - # 2. compute losses for the batch in one shot + + # Compute losses for the batch in one shot candidate_losses = losses_for_candidates( weights, indices, estimate_matrix, targets, normalisation_factor ) - # 3. convert to relative change vs. baseline + + # Convert to relative change vs. baseline household_loss_rel_changes = ( candidate_losses - baseline_loss ) / baseline_loss - inclusion_mask = full_mask.copy() - household_loss_rel_changes = np.array(household_loss_rel_changes) # Sort by the relative change in loss sorted_indices = np.argsort(household_loss_rel_changes) + # Remove the worst households - num_to_remove = int(len(weights) * FRACTION_REMOVE_PER_ITERATION) - worst_indices = sorted_indices[:num_to_remove] + num_to_remove = int(len(weights) * fraction_remove_per_iteration) + worst_indices = indices[sorted_indices[:num_to_remove]] inclusion_mask[worst_indices] = False + # Calculate the new loss new_loss = get_loss_from_mask( - inclusion_mask, estimate_matrix, targets, normalisation_factor + weights, + inclusion_mask, + estimate_matrix, + targets, + normalisation_factor, ) rel_change = (new_loss - baseline_loss) / baseline_loss + if rel_change > loss_rel_change_max: print( - f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}, which is too high ({rel_change:.2%}). Stopping." + f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}, " + f"which is too high ({rel_change:.2%}). Stopping." ) break + print( f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}" ) print( f"Removed {num_to_remove} households with worst relative loss changes." 
) + # Update the full mask full_mask &= inclusion_mask - household_ids = sim.calculate("household_id", 2024).values - remaining_households = household_ids[full_mask] + return full_mask + + +def random_sampling_minimization( + weights, + estimate_matrix, + targets, + normalisation_factor, + target_fractions=[0.1, 0.2, 0.3, 0.4, 0.5], +): + """A simple random sampling approach""" + n = len(weights) + + final_mask = None + lowest_loss = float("inf") + for fraction in target_fractions: + target_size = int(n * fraction) + # Random sampling with multiple attempts + best_mask = None + best_loss = float("inf") + + for _ in range(5): # Try 5 random samples + mask = np.zeros(n, dtype=bool) + mask[np.random.choice(n, target_size, replace=False)] = True + + loss = get_loss_from_mask( + weights, mask, estimate_matrix, targets, normalisation_factor + ) + + if loss < best_loss: + best_loss = loss + best_mask = mask + + if lowest_loss > best_loss: + lowest_loss = best_loss + final_mask = best_mask + + return final_mask + + +def minimise_dataset( + dataset, + output_path: str, + loss_rel_change_max: float, + minimization_function: Callable = candidate_loss_contribution, + **kwargs, +) -> None: + """ + Main function to minimize a dataset using a specified minimization approach. + + Parameters + ---------- + dataset : path to the dataset file or Dataset object + output_path : path where the minimized dataset will be saved + loss_rel_change_max : maximum allowed relative change in loss + minimization_function : function that implements the minimization logic + **kwargs : additional arguments to pass to the minimization function + """ + dataset = str(dataset) + create_calibration_log_file(dataset) + + dataset = Dataset.from_file(dataset) + loss_matrix = build_loss_matrix(dataset, 2024) - # At this point we have a mask of households to keep + sim = Microsimulation(dataset=dataset) - # I'm saving to a csv for ease of debugging, but we need to save to a .h5 file + weights = sim.calculate("household_weight", 2024).values + estimate_matrix, targets = loss_matrix + is_national = estimate_matrix.columns.str.startswith("nation/") + nation_normalisation_factor = is_national * (1 / is_national.sum()) + state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) + normalisation_factor = np.where( + is_national, nation_normalisation_factor, state_normalisation_factor + ) + + # Call the minimization function + inclusion_mask = minimization_function( + weights=weights, + estimate_matrix=estimate_matrix, + targets=targets, + normalisation_factor=normalisation_factor, + loss_rel_change_max=loss_rel_change_max, + **kwargs, + ) + + # Extract household IDs for remaining households + household_ids = sim.calculate("household_id", 2024).values + remaining_households = household_ids[inclusion_mask] + # Create a smaller dataset with only the remaining households df = sim.to_input_dataframe() smaller_df = df[df["household_id__2024"].isin(remaining_households)] @@ -208,27 +306,30 @@ def get_loss_from_mask( ) print(f"Weight relative change: {weight_rel_change:.2%}") + # Create new simulation with smaller dataset sim = Microsimulation(dataset=smaller_df) + # Rescale weights to maintain total sim.set_input( "household_weight", 2024, sim.calculate("household_weight", 2024).values / weight_rel_change, ) + # Prepare data for saving data = {} - for variable in sim.input_variables: data[variable] = {2024: sim.calculate(variable, 2024).values} if data[variable][2024].dtype == "object": data[variable][2024] = 
data[variable][2024].astype("S") + # Save to HDF5 file with h5py.File(output_path, "w") as f: for variable, values in data.items(): for year, value in values.items(): f.create_dataset(f"{variable}/{year}", data=value) - print(f"Saved minimised dataset to {output_path}") + print(f"Saved minimised dataset to {output_path}") create_calibration_log_file(output_path) @@ -240,4 +341,9 @@ def get_loss_from_mask( for file in files: output_path = file.with_name(file.stem + "_minimised.h5") - minimise_dataset(file, output_path, loss_rel_change_max=10) + minimise_dataset( + file, + output_path, + loss_rel_change_max=10, + minimization_function=candidate_loss_contribution, + ) From cd0776c0eb7d1745e987ace34ecc4b56306eee2b Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Fri, 11 Jul 2025 19:19:58 +0200 Subject: [PATCH 07/58] add notebook with testing functionality (havent tested locally) --- .../datasets/cps/enhanced_cps.py | 8 +- policyengine_us_data/utils/minimise.py | 2 +- test_minimization_approach.ipynb | 107 ++++++++++++++++++ 3 files changed, 115 insertions(+), 2 deletions(-) create mode 100644 test_minimization_approach.ipynb diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index bf303f7a..08798622 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -28,6 +28,7 @@ def reweight( targets_array, dropout_rate=0.05, log_path="calibration_log.csv", + penalty_approach="l0_sigmoid", ): target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") @@ -46,7 +47,7 @@ def reweight( ) # TO DO: replace this with a call to the python reweight.py package. - def loss(weights, penalty_approach="l0_sigmoid"): + def loss(weights, penalty_approach=penalty_approach): # Check for Nans in either the weights or the loss matrix if torch.isnan(weights).any(): raise ValueError("Weights contain NaNs") @@ -85,6 +86,11 @@ def loss(weights, penalty_approach="l0_sigmoid"): smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() # L1 penalty + l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs + + if penalty_approach == "l1": + l1 = torch.mean(weights) + return rel_error_normalized.mean() + l1_penalty_weight * l1 return rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 45212905..a9ba3959 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -330,7 +330,7 @@ def minimise_dataset( f.create_dataset(f"{variable}/{year}", data=value) print(f"Saved minimised dataset to {output_path}") - create_calibration_log_file(output_path) + create_calibration_log_file(output_path) if __name__ == "__main__": diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb new file mode 100644 index 00000000..519d2725 --- /dev/null +++ b/test_minimization_approach.ipynb @@ -0,0 +1,107 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 5, + "id": "d6dc9cca", + "metadata": {}, + "outputs": [], + "source": [ + "from policyengine_us_data.utils.minimise import minimise_dataset, random_sampling_minimization, candidate_loss_contribution\n", + "from policyengine_us_data.storage import STORAGE_FOLDER\n", + "from policyengine_us import Microsimulation\n", + "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", + "from policyengine_us_data.utils import 
build_loss_matrix\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db975ac1", + "metadata": {}, + "outputs": [], + "source": [ + "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", + "\n", + "files = [\n", + " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", + " ]\n", + "\n", + "approach = \"random_sampling_minimization\" # for which you can specify the fraction\n", + "minimization_function = random_sampling_minimization\n", + "# other minimization function approach is \"candidate_loss_contribution\"\n", + "\n", + "for file in files:\n", + " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + " minimise_dataset(\n", + " file,\n", + " output_path,\n", + " loss_rel_change_max=10,\n", + " minimization_function=minimization_function, \n", + " target_fractions=[0.5] # remove if switching approach\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35892c9d", + "metadata": {}, + "outputs": [], + "source": [ + "## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation\n", + "\n", + "input_dataset = ExtendedCPS_2024\n", + "\n", + "approach = \"l0_sigmoid\"\n", + "# other options are \"l0_log\", \"l0_exp\", \"l1\"\n", + "\n", + "sim = Microsimulation(dataset=input_dataset)\n", + "data = sim.dataset.load_dataset()\n", + "data[\"household_weight\"] = {}\n", + "original_weights = sim.calculate(\"household_weight\")\n", + "original_weights = original_weights.values + np.random.normal(\n", + " 1, 0.1, len(original_weights)\n", + ")\n", + "for year in range(2024, 2025):\n", + " loss_matrix, targets_array = build_loss_matrix(\n", + " input_dataset, year\n", + " )\n", + " optimised_weights = reweight(\n", + " original_weights,\n", + " loss_matrix,\n", + " targets_array,\n", + " log_path= STORAGE_FOLDER / approach / \"calibration_log.csv\",\n", + " penalty_approach=approach,\n", + " )\n", + " data[\"household_weight\"][year] = optimised_weights\n", + "\n", + "output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + "\n", + "data.save_dataset(output_path)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pe", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 2c050fc973ba312d070c27dcb7f1fb049e1e2af2 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Fri, 11 Jul 2025 19:20:55 +0200 Subject: [PATCH 08/58] lint --- policyengine_us_data/utils/minimise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index a9ba3959..45212905 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -330,7 +330,7 @@ def minimise_dataset( f.create_dataset(f"{variable}/{year}", data=value) print(f"Saved minimised dataset to {output_path}") - create_calibration_log_file(output_path) + create_calibration_log_file(output_path) if __name__ == "__main__": From ee98fc36ab920d571982862dc48d950b7a58ec3d Mon Sep 17 00:00:00 2001 From: eccuraa Date: Fri, 11 Jul 2025 20:06:32 -0400 Subject: [PATCH 09/58] debugged 2nd cell: created path & removed optional parameters. 
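A usage sketch (not part of the commit) of the reworked entry point: minimise_dataset now forwards **kwargs to the chosen strategy, so candidate_loss_contribution receives loss_rel_change_max while random_sampling_minimization receives target_fractions. The output file name for the random-sampling run is made up.

from policyengine_us_data.storage import STORAGE_FOLDER
from policyengine_us_data.utils.minimise import (
    minimise_dataset,
    candidate_loss_contribution,
    random_sampling_minimization,
)

source = STORAGE_FOLDER / "enhanced_cps_2024.h5"

# Greedy candidate-loss-contribution strategy: stops once the loss worsens
# by more than loss_rel_change_max.
minimise_dataset(
    source,
    STORAGE_FOLDER / "enhanced_cps_2024_minimised.h5",
    minimization_function=candidate_loss_contribution,
    loss_rel_change_max=10,
)

# Random-sampling strategy: keeps the best random subset at each fraction.
minimise_dataset(
    source,
    STORAGE_FOLDER / "enhanced_cps_2024_random_sample.h5",  # hypothetical name
    minimization_function=random_sampling_minimization,
    target_fractions=[0.5],
)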
--- policyengine_us_data/utils/minimise.py | 8 +- test_minimization_approach.ipynb | 219 +++++++++++++++++++++++-- 2 files changed, 210 insertions(+), 17 deletions(-) diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 45212905..e84e1bee 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -250,10 +250,11 @@ def random_sampling_minimization( def minimise_dataset( dataset, output_path: str, - loss_rel_change_max: float, minimization_function: Callable = candidate_loss_contribution, **kwargs, ) -> None: + #loss_rel_change_max = kwargs.pop('loss_rel_change_max', 10.0) + """ Main function to minimize a dataset using a specified minimization approach. @@ -288,8 +289,7 @@ def minimise_dataset( estimate_matrix=estimate_matrix, targets=targets, normalisation_factor=normalisation_factor, - loss_rel_change_max=loss_rel_change_max, - **kwargs, + **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. ) # Extract household IDs for remaining households @@ -344,6 +344,4 @@ def minimise_dataset( minimise_dataset( file, output_path, - loss_rel_change_max=10, - minimization_function=candidate_loss_contribution, ) diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 519d2725..8400d4fe 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "d6dc9cca", "metadata": {}, "outputs": [], @@ -12,15 +12,188 @@ "from policyengine_us import Microsimulation\n", "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", "from policyengine_us_data.utils import build_loss_matrix\n", - "import numpy as np" + "import numpy as np\n", + "import os\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "db975ac1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + "Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid enrollment for IN with target 1623361k\n", + "Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with target 1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + "Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid 
enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n", + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + "Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid enrollment for IN with target 1623361k\n", + "Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with target 1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + "Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid 
enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n", + "Iteration 1: Loss changed from 4.565926440883813e-08 to 5.336074793134824e-08\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 2: Loss changed from 5.336074793134824e-08 to 6.613975524012925e-08\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 3: Loss changed from 6.613975524012925e-08 to 8.772356346466888e-08\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 4: Loss changed from 8.772356346466888e-08 to 1.1406333044735764e-07\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 5: Loss changed from 1.1406333044735764e-07 to 1.3666972105275468e-07\n", + "Removed 4131 households with worst relative loss changes.\n", + "Weight relative change: 52.19%\n", + "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + "Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid 
enrollment for IN with target 1623361k\n", + "Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with target 1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + "Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n" + ] + } + ], "source": [ "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", "\n", @@ -28,27 +201,49 @@ " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", " ]\n", "\n", - "approach = \"random_sampling_minimization\" # for which you can specify the fraction\n", - "minimization_function = random_sampling_minimization\n", - "# other minimization function approach is \"candidate_loss_contribution\"\n", + "approach = \"candidate_loss_contribution\" # for which you can specify the fraction\n", + "minimization_function = candidate_loss_contribution\n", + "# other minimization function approach is \"candidate_loss_contribution\", for which you can specify the tolerance for loss relative change.\n", "\n", "for file in files:\n", " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + "\n", " minimise_dataset(\n", " file,\n", " output_path,\n", " loss_rel_change_max=10,\n", " minimization_function=minimization_function, \n", - " target_fractions=[0.5] # remove if switching approach\n", + " #target_fractions=[0.5] # remove if switching approach\n", " )" ] }, { "cell_type": "code", - "execution_count": null, 
+ "execution_count": 5, "id": "35892c9d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "ValueError", + "evalue": "Invalid response code 404 for url https://api.github.com/repos/policyengine/irs-soi-puf/releases/tags/1.8.0.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[5], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m approach \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124ml0_sigmoid\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# other options are \"l0_log\", \"l0_exp\", \"l1\"\u001b[39;00m\n\u001b[0;32m----> 8\u001b[0m sim \u001b[38;5;241m=\u001b[39m \u001b[43mMicrosimulation\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minput_dataset\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 9\u001b[0m data \u001b[38;5;241m=\u001b[39m sim\u001b[38;5;241m.\u001b[39mdataset\u001b[38;5;241m.\u001b[39mload_dataset()\n\u001b[1;32m 10\u001b[0m data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhousehold_weight\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m {}\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/system.py:221\u001b[0m, in \u001b[0;36mMicrosimulation.__init__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 214\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 215\u001b[0m dataset \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, \u001b[38;5;28mstr\u001b[39m)\n\u001b[1;32m 217\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcps_2023\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m dataset\n\u001b[1;32m 218\u001b[0m ):\n\u001b[1;32m 219\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m2023\u001b[39m\n\u001b[0;32m--> 221\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 223\u001b[0m reform \u001b[38;5;241m=\u001b[39m create_structural_reforms_from_parameters(\n\u001b[1;32m 224\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtax_benefit_system\u001b[38;5;241m.\u001b[39mparameters, start_instant\n\u001b[1;32m 225\u001b[0m )\n\u001b[1;32m 226\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m reform \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:184\u001b[0m, in \u001b[0;36mSimulation.__init__\u001b[0;34m(self, tax_benefit_system, populations, situation, dataset, reform, trace, default_input_period, default_calculation_period)\u001b[0m\n\u001b[1;32m 180\u001b[0m dataset \u001b[38;5;241m=\u001b[39m 
Dataset\u001b[38;5;241m.\u001b[39mfrom_file(\n\u001b[1;32m 181\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 182\u001b[0m )\n\u001b[1;32m 183\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, \u001b[38;5;28mtype\u001b[39m):\n\u001b[0;32m--> 184\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset: Dataset \u001b[38;5;241m=\u001b[39m \u001b[43mdataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequire\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 185\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, pd\u001b[38;5;241m.\u001b[39mDataFrame):\n\u001b[1;32m 186\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_dataframe(\n\u001b[1;32m 187\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 188\u001b[0m )\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/data/dataset.py:103\u001b[0m, in \u001b[0;36mDataset.__init__\u001b[0;34m(self, require)\u001b[0m\n\u001b[1;32m 101\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdownload()\n\u001b[1;32m 102\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 103\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/datasets/cps/extended_cps.py:147\u001b[0m, in \u001b[0;36mExtendedCPS.generate\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 144\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpolicyengine_us\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Microsimulation\n\u001b[1;32m 146\u001b[0m cps_sim \u001b[38;5;241m=\u001b[39m Microsimulation(dataset\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcps)\n\u001b[0;32m--> 147\u001b[0m puf_sim \u001b[38;5;241m=\u001b[39m \u001b[43mMicrosimulation\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpuf\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 149\u001b[0m puf_sim\u001b[38;5;241m.\u001b[39msubsample(\u001b[38;5;241m10_000\u001b[39m)\n\u001b[1;32m 151\u001b[0m INPUTS \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 152\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mage\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 153\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mis_male\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mis_tax_unit_dependent\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 159\u001b[0m ]\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/system.py:221\u001b[0m, in \u001b[0;36mMicrosimulation.__init__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 214\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 215\u001b[0m dataset \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 
216\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, \u001b[38;5;28mstr\u001b[39m)\n\u001b[1;32m 217\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcps_2023\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m dataset\n\u001b[1;32m 218\u001b[0m ):\n\u001b[1;32m 219\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m2023\u001b[39m\n\u001b[0;32m--> 221\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 223\u001b[0m reform \u001b[38;5;241m=\u001b[39m create_structural_reforms_from_parameters(\n\u001b[1;32m 224\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtax_benefit_system\u001b[38;5;241m.\u001b[39mparameters, start_instant\n\u001b[1;32m 225\u001b[0m )\n\u001b[1;32m 226\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m reform \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:184\u001b[0m, in \u001b[0;36mSimulation.__init__\u001b[0;34m(self, tax_benefit_system, populations, situation, dataset, reform, trace, default_input_period, default_calculation_period)\u001b[0m\n\u001b[1;32m 180\u001b[0m dataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_file(\n\u001b[1;32m 181\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 182\u001b[0m )\n\u001b[1;32m 183\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, \u001b[38;5;28mtype\u001b[39m):\n\u001b[0;32m--> 184\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset: Dataset \u001b[38;5;241m=\u001b[39m \u001b[43mdataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequire\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 185\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, pd\u001b[38;5;241m.\u001b[39mDataFrame):\n\u001b[1;32m 186\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_dataframe(\n\u001b[1;32m 187\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 188\u001b[0m )\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/data/dataset.py:101\u001b[0m, in \u001b[0;36mDataset.__init__\u001b[0;34m(self, require)\u001b[0m\n\u001b[1;32m 99\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexists \u001b[38;5;129;01mand\u001b[39;00m require:\n\u001b[1;32m 100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39murl \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 
101\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdownload\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 102\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgenerate()\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/data/dataset.py:343\u001b[0m, in \u001b[0;36mDataset.download\u001b[0;34m(self, url, version)\u001b[0m\n\u001b[1;32m 341\u001b[0m response \u001b[38;5;241m=\u001b[39m requests\u001b[38;5;241m.\u001b[39mget(url, headers\u001b[38;5;241m=\u001b[39mauth_headers)\n\u001b[1;32m 342\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m response\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m200\u001b[39m:\n\u001b[0;32m--> 343\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 344\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid response code \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresponse\u001b[38;5;241m.\u001b[39mstatus_code\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m for url \u001b[39m\u001b[38;5;132;01m{\u001b[39;00murl\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 345\u001b[0m )\n\u001b[1;32m 346\u001b[0m assets \u001b[38;5;241m=\u001b[39m response\u001b[38;5;241m.\u001b[39mjson()[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124massets\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 347\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m asset \u001b[38;5;129;01min\u001b[39;00m assets:\n", + "\u001b[0;31mValueError\u001b[0m: Invalid response code 404 for url https://api.github.com/repos/policyengine/irs-soi-puf/releases/tags/1.8.0." + ] + } + ], "source": [ "## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation\n", "\n", @@ -85,7 +280,7 @@ ], "metadata": { "kernelspec": { - "display_name": "pe", + "display_name": "policyengine-us-data", "language": "python", "name": "python3" }, @@ -99,7 +294,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.11" + "version": "3.11.13" } }, "nbformat": 4, From f6d7f0fa00f158f099c2dc15116fac4987d33085 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Mon, 14 Jul 2025 15:22:58 +0200 Subject: [PATCH 10/58] few updates to the testing framework --- changelog_entry.yaml | 2 +- .../datasets/cps/enhanced_cps.py | 78 +++++++++++++------ policyengine_us_data/utils/minimise.py | 75 +++++++++++++----- pyproject.toml | 4 +- test_minimization_approach.ipynb | 75 +++++++++--------- 5 files changed, 149 insertions(+), 85 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 84eeb584..ac664753 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,4 +1,4 @@ - bump: minor changes: added: - - Enhanced CPS minimizing tests. \ No newline at end of file + - Enhanced CPS minimizing tests. 
\ No newline at end of file diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 08798622..6ad510f3 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -22,13 +22,25 @@ torch = None +bad_targets = [ + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", +] + + def reweight( original_weights, loss_matrix, targets_array, dropout_rate=0.05, log_path="calibration_log.csv", - penalty_approach="l0_sigmoid", + penalty_approach=None, ): target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") @@ -64,35 +76,43 @@ def loss(weights, penalty_approach=penalty_approach): if torch.isnan(rel_error_normalized).any(): raise ValueError("Relative error contains NaNs") - # L0 penalty (approximated with smooth function) - # Since L0 is non-differentiable, we use a smooth approximation - # Common approaches: + if penalty_approach is not None: + # L0 penalty (approximated with smooth function) + # Since L0 is non-differentiable, we use a smooth approximation + # Common approaches: + + epsilon = 1e-3 # Threshold for "near zero" + l0_penalty_weight = 1e-1 # Adjust this hyperparameter - epsilon = 1e-3 # Threshold for "near zero" - l0_penalty_weight = 1e-1 # Adjust this hyperparameter + # Option 1: Sigmoid approximation + if penalty_approach == "l0_sigmoid": + smoothed_l0 = torch.sigmoid( + (weights - epsilon) / (epsilon * 0.1) + ).mean() - # Option 1: Sigmoid approximation - if penalty_approach == "l0_sigmoid": - smoothed_l0 = torch.sigmoid( - (weights - epsilon) / (epsilon * 0.1) - ).mean() + # Option 2: Log-sum penalty (smoother) + if penalty_approach == "l0_log": + smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len( + weights + ) - # Option 2: Log-sum penalty (smoother) - if penalty_approach == "l0_log": - smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len(weights) + # Option 3: Exponential penalty + if penalty_approach == "l0_exp": + smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() - # Option 3: Exponential penalty - if penalty_approach == "l0_exp": - smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() + # L1 penalty + l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs - # L1 penalty - l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs + if penalty_approach == "l1": + l1 = torch.mean(weights) + return rel_error_normalized.mean() + l1_penalty_weight * l1 - if penalty_approach == "l1": - l1 = torch.mean(weights) - return rel_error_normalized.mean() + l1_penalty_weight * l1 + return ( + rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 + ) - return rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 + else: + return rel_error_normalized.mean() def dropout_weights(weights, p): if p == 0: @@ -213,10 +233,18 @@ def generate(self): 
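# --- Editor's illustrative sketch (not part of the patch above) ---------------
# The hunk above sketches several smooth stand-ins for an L0 sparsity penalty on
# the household weights, plus an L1 option. They are re-stated here as
# standalone functions so they can be inspected or unit-tested in isolation.
# This is a minimal sketch, assuming PyTorch is available; EPSILON and the
# penalty multipliers simply mirror the hyperparameters in the diff and are not
# tuned recommendations.
import torch

EPSILON = 1e-3  # threshold below which a household weight counts as "near zero"


def smooth_l0_sigmoid(weights: torch.Tensor) -> torch.Tensor:
    # Sharp sigmoid step around EPSILON: a differentiable proxy for counting
    # non-zero weights.
    return torch.sigmoid((weights - EPSILON) / (EPSILON * 0.1)).mean()


def smooth_l0_log(weights: torch.Tensor) -> torch.Tensor:
    # Log-sum penalty: steep near zero, grows slowly for large weights.
    return torch.log(1 + weights / EPSILON).sum() / len(weights)


def smooth_l0_exp(weights: torch.Tensor) -> torch.Tensor:
    # Exponential penalty: saturates at 1 for weights well above EPSILON.
    return (1 - torch.exp(-weights / EPSILON)).mean()


def l1_penalty(weights: torch.Tensor) -> torch.Tensor:
    return weights.mean()


if __name__ == "__main__":
    w = torch.rand(1_000) * 2          # hypothetical household weights
    base_loss = torch.tensor(0.05)     # stand-in for the calibration loss term
    print(base_loss + 1e-1 * smooth_l0_sigmoid(w))   # L0-penalised loss
    print(base_loss + 1e-2 * l1_penalty(w))          # L1-penalised loss
# -------------------------------------------------------------------------------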
loss_matrix, targets_array = build_loss_matrix( self.input_dataset, year ) + + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_array_clean = targets_array[keep_idx] + assert loss_matrix_clean.shape[1] == targets_array_clean.size + optimised_weights = reweight( original_weights, - loss_matrix, - targets_array, + loss_matrix_clean, + targets_array_clean, log_path="calibration_log.csv", ) data["household_weight"][year] = optimised_weights diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index e84e1bee..df193c6e 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -7,30 +7,53 @@ from policyengine_us_data.storage import STORAGE_FOLDER from typing import Optional, Callable - -def create_calibration_log_file(file_path): +bad_targets = [ + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", +] + + +def create_calibration_log_file(file_path, epoch=0): dataset = Dataset.from_file(file_path) - loss_matrix = build_loss_matrix(dataset, 2024) + loss_matrix, targets = build_loss_matrix(dataset, 2024) + + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_clean = targets[keep_idx] + assert loss_matrix_clean.shape[1] == targets_clean.size sim = Microsimulation(dataset=dataset) - estimates = sim.calculate("household_weight", 2024).values @ loss_matrix[0] - target_names = loss_matrix[0].columns - target_values = loss_matrix[1] + estimates = ( + sim.calculate("household_weight", 2024).values @ loss_matrix_clean + ) + target_names = loss_matrix_clean.columns df = pd.DataFrame( { "target_name": target_names, "estimate": estimates, - "target": target_values, + "target": targets_clean, } ) - df["epoch"] = 0 + df["epoch"] = epoch df["error"] = df["estimate"] - df["target"] df["rel_error"] = df["error"] / df["target"] df["abs_error"] = df["error"].abs() - df["rel_abs_error"] = df["abs_error"] / df["target"].abs() + df["rel_abs_error"] = ( + df["abs_error"] / df["target"].abs() + if df["target"].abs().sum() > 0 + else np.nan + ) df["loss"] = (df["rel_error"] ** 2).mean() df.to_csv( @@ -215,11 +238,14 @@ def random_sampling_minimization( estimate_matrix, targets, normalisation_factor, + random=True, target_fractions=[0.1, 0.2, 0.3, 0.4, 0.5], ): """A simple random sampling approach""" n = len(weights) + household_weights_normalized = weights / weights.sum() + final_mask = None lowest_loss = float("inf") for fraction in target_fractions: @@ -230,7 +256,14 @@ def random_sampling_minimization( for _ in range(5): # Try 5 random samples mask = np.zeros(n, dtype=bool) - mask[np.random.choice(n, target_size, replace=False)] = True + mask[ 
+ np.random.choice( + n, + target_size, + p=household_weights_normalized if random else None, + replace=False, + ) + ] = True loss = get_loss_from_mask( weights, mask, estimate_matrix, targets, normalisation_factor @@ -253,8 +286,6 @@ def minimise_dataset( minimization_function: Callable = candidate_loss_contribution, **kwargs, ) -> None: - #loss_rel_change_max = kwargs.pop('loss_rel_change_max', 10.0) - """ Main function to minimize a dataset using a specified minimization approach. @@ -270,13 +301,19 @@ def minimise_dataset( create_calibration_log_file(dataset) dataset = Dataset.from_file(dataset) - loss_matrix = build_loss_matrix(dataset, 2024) + loss_matrix, targets = build_loss_matrix(dataset, 2024) + + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_clean = targets[keep_idx] + assert loss_matrix_clean.shape[1] == targets_clean.size sim = Microsimulation(dataset=dataset) weights = sim.calculate("household_weight", 2024).values - estimate_matrix, targets = loss_matrix - is_national = estimate_matrix.columns.str.startswith("nation/") + is_national = loss_matrix_clean.columns.str.startswith("nation/") nation_normalisation_factor = is_national * (1 / is_national.sum()) state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) normalisation_factor = np.where( @@ -286,10 +323,10 @@ def minimise_dataset( # Call the minimization function inclusion_mask = minimization_function( weights=weights, - estimate_matrix=estimate_matrix, - targets=targets, + estimate_matrix=loss_matrix_clean, + targets=targets_clean, normalisation_factor=normalisation_factor, - **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. + **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. 
) # Extract household IDs for remaining households @@ -330,7 +367,7 @@ def minimise_dataset( f.create_dataset(f"{variable}/{year}", data=value) print(f"Saved minimised dataset to {output_path}") - create_calibration_log_file(output_path) + create_calibration_log_file(output_path, epoch=500) if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml index 0352db69..65d1ca8e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,8 +17,8 @@ authors = [ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ - "policyengine-us>=1.197.0", - "policyengine-core>=3.14.1", + "policyengine-us>=1.340.0", + "policyengine-core>=3.17.1", "requests", "tqdm", "microdf_python>=0.4.3", diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 8400d4fe..54f3c6fa 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 6, + "execution_count": 1, "id": "d6dc9cca", "metadata": {}, "outputs": [], @@ -13,12 +13,24 @@ "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", "from policyengine_us_data.utils import build_loss_matrix\n", "import numpy as np\n", - "import os\n" + "import os" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, + "id": "6daabe7c", + "metadata": {}, + "outputs": [], + "source": [ + "# Original ECPS 2024 dataset size (for household entity): 41310\n", + "# After minimization through \"candidate_loss_contribution\" and a 1.0 max error change (for household entity): 20655 \n", + "# After minimization through \"candidate_loss_contribution\" and a 0.001 max error change (for household entity): 24786\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "id": "db975ac1", "metadata": {}, "outputs": [ @@ -128,18 +140,17 @@ "Targeting Medicaid enrollment for WI with target 1108320k\n", "Targeting Medicaid enrollment for WV with target 467632k\n", "Targeting Medicaid enrollment for WY with target 57320k\n", - "Iteration 1: Loss changed from 4.565926440883813e-08 to 5.336074793134824e-08\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 2: Loss changed from 5.336074793134824e-08 to 6.613975524012925e-08\n", + "Iteration 1: Loss changed from 3.2762747622384236e-07 to 3.269240481699973e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Iteration 3: Loss changed from 6.613975524012925e-08 to 8.772356346466888e-08\n", + "Iteration 2: Loss changed from 3.269240481699973e-07 to 3.260236288186747e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Iteration 4: Loss changed from 8.772356346466888e-08 to 1.1406333044735764e-07\n", + "Iteration 3: Loss changed from 3.260236288186747e-07 to 3.258031076364204e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Iteration 5: Loss changed from 1.1406333044735764e-07 to 1.3666972105275468e-07\n", + "Iteration 4: Loss changed from 3.258031076364204e-07 to 3.2572688458943833e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Weight relative change: 52.19%\n", - "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", + "Iteration 5: Loss changed from 3.2572688458943833e-07 to 3.2598496400284314e-07, which is too high (0.08%). 
Stopping.\n", + "Weight relative change: 99.10%\n", + "Saved minimised dataset to /Users/movil1/Desktop/PYTHONJOBS/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", "Targeting Medicaid enrollment for AK with target 231577k\n", "Targeting Medicaid enrollment for AL with target 766009k\n", "Targeting Medicaid enrollment for AR with target 733561k\n", @@ -203,7 +214,7 @@ "\n", "approach = \"candidate_loss_contribution\" # for which you can specify the fraction\n", "minimization_function = candidate_loss_contribution\n", - "# other minimization function approach is \"candidate_loss_contribution\", for which you can specify the tolerance for loss relative change.\n", + "# other minimization function approach is \"random_sampling_minimization\", for which you can specify the tolerance for loss relative change.\n", "\n", "for file in files:\n", " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", @@ -212,38 +223,18 @@ " minimise_dataset(\n", " file,\n", " output_path,\n", - " loss_rel_change_max=10,\n", " minimization_function=minimization_function, \n", - " #target_fractions=[0.5] # remove if switching approach\n", + " # target_fractions=[0.5] # remove if switching approach\n", + " loss_rel_change_max=0.0001, # remove if switching approach\n", " )" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "35892c9d", "metadata": {}, - "outputs": [ - { - "ename": "ValueError", - "evalue": "Invalid response code 404 for url https://api.github.com/repos/policyengine/irs-soi-puf/releases/tags/1.8.0.", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[5], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m approach \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124ml0_sigmoid\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# other options are \"l0_log\", \"l0_exp\", \"l1\"\u001b[39;00m\n\u001b[0;32m----> 8\u001b[0m sim \u001b[38;5;241m=\u001b[39m \u001b[43mMicrosimulation\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minput_dataset\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 9\u001b[0m data \u001b[38;5;241m=\u001b[39m sim\u001b[38;5;241m.\u001b[39mdataset\u001b[38;5;241m.\u001b[39mload_dataset()\n\u001b[1;32m 10\u001b[0m data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhousehold_weight\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m {}\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/system.py:221\u001b[0m, in \u001b[0;36mMicrosimulation.__init__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 214\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 215\u001b[0m dataset \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, \u001b[38;5;28mstr\u001b[39m)\n\u001b[1;32m 217\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcps_2023\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m dataset\n\u001b[1;32m 218\u001b[0m ):\n\u001b[1;32m 219\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m2023\u001b[39m\n\u001b[0;32m--> 221\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 223\u001b[0m reform \u001b[38;5;241m=\u001b[39m create_structural_reforms_from_parameters(\n\u001b[1;32m 224\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtax_benefit_system\u001b[38;5;241m.\u001b[39mparameters, start_instant\n\u001b[1;32m 225\u001b[0m )\n\u001b[1;32m 226\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m reform \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:184\u001b[0m, in \u001b[0;36mSimulation.__init__\u001b[0;34m(self, tax_benefit_system, populations, situation, dataset, reform, trace, default_input_period, default_calculation_period)\u001b[0m\n\u001b[1;32m 180\u001b[0m dataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_file(\n\u001b[1;32m 181\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 182\u001b[0m )\n\u001b[1;32m 183\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, \u001b[38;5;28mtype\u001b[39m):\n\u001b[0;32m--> 184\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset: Dataset \u001b[38;5;241m=\u001b[39m \u001b[43mdataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequire\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 185\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, pd\u001b[38;5;241m.\u001b[39mDataFrame):\n\u001b[1;32m 186\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_dataframe(\n\u001b[1;32m 187\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 188\u001b[0m )\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/data/dataset.py:103\u001b[0m, in \u001b[0;36mDataset.__init__\u001b[0;34m(self, require)\u001b[0m\n\u001b[1;32m 101\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdownload()\n\u001b[1;32m 102\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 103\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/datasets/cps/extended_cps.py:147\u001b[0m, in \u001b[0;36mExtendedCPS.generate\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 144\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpolicyengine_us\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Microsimulation\n\u001b[1;32m 146\u001b[0m cps_sim \u001b[38;5;241m=\u001b[39m 
Microsimulation(dataset\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcps)\n\u001b[0;32m--> 147\u001b[0m puf_sim \u001b[38;5;241m=\u001b[39m \u001b[43mMicrosimulation\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpuf\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 149\u001b[0m puf_sim\u001b[38;5;241m.\u001b[39msubsample(\u001b[38;5;241m10_000\u001b[39m)\n\u001b[1;32m 151\u001b[0m INPUTS \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 152\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mage\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 153\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mis_male\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mis_tax_unit_dependent\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 159\u001b[0m ]\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/system.py:221\u001b[0m, in \u001b[0;36mMicrosimulation.__init__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 214\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 215\u001b[0m dataset \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, \u001b[38;5;28mstr\u001b[39m)\n\u001b[1;32m 217\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcps_2023\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m dataset\n\u001b[1;32m 218\u001b[0m ):\n\u001b[1;32m 219\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m2023\u001b[39m\n\u001b[0;32m--> 221\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 223\u001b[0m reform \u001b[38;5;241m=\u001b[39m create_structural_reforms_from_parameters(\n\u001b[1;32m 224\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtax_benefit_system\u001b[38;5;241m.\u001b[39mparameters, start_instant\n\u001b[1;32m 225\u001b[0m )\n\u001b[1;32m 226\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m reform \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:184\u001b[0m, in \u001b[0;36mSimulation.__init__\u001b[0;34m(self, tax_benefit_system, populations, situation, dataset, reform, trace, default_input_period, default_calculation_period)\u001b[0m\n\u001b[1;32m 180\u001b[0m dataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_file(\n\u001b[1;32m 181\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 182\u001b[0m )\n\u001b[1;32m 183\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, 
\u001b[38;5;28mtype\u001b[39m):\n\u001b[0;32m--> 184\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset: Dataset \u001b[38;5;241m=\u001b[39m \u001b[43mdataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequire\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 185\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, pd\u001b[38;5;241m.\u001b[39mDataFrame):\n\u001b[1;32m 186\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_dataframe(\n\u001b[1;32m 187\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 188\u001b[0m )\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/data/dataset.py:101\u001b[0m, in \u001b[0;36mDataset.__init__\u001b[0;34m(self, require)\u001b[0m\n\u001b[1;32m 99\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexists \u001b[38;5;129;01mand\u001b[39;00m require:\n\u001b[1;32m 100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39murl \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 101\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdownload\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 102\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgenerate()\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/data/dataset.py:343\u001b[0m, in \u001b[0;36mDataset.download\u001b[0;34m(self, url, version)\u001b[0m\n\u001b[1;32m 341\u001b[0m response \u001b[38;5;241m=\u001b[39m requests\u001b[38;5;241m.\u001b[39mget(url, headers\u001b[38;5;241m=\u001b[39mauth_headers)\n\u001b[1;32m 342\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m response\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m200\u001b[39m:\n\u001b[0;32m--> 343\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 344\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid response code \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresponse\u001b[38;5;241m.\u001b[39mstatus_code\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m for url \u001b[39m\u001b[38;5;132;01m{\u001b[39;00murl\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 345\u001b[0m )\n\u001b[1;32m 346\u001b[0m assets \u001b[38;5;241m=\u001b[39m response\u001b[38;5;241m.\u001b[39mjson()[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124massets\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 347\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m asset \u001b[38;5;129;01min\u001b[39;00m assets:\n", - "\u001b[0;31mValueError\u001b[0m: Invalid response code 404 for url https://api.github.com/repos/policyengine/irs-soi-puf/releases/tags/1.8.0." 
- ] - } - ], + "outputs": [], "source": [ "## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation\n", "\n", @@ -276,11 +267,19 @@ "\n", "data.save_dataset(output_path)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4cf8e89", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "policyengine-us-data", + "display_name": "pe", "language": "python", "name": "python3" }, @@ -294,7 +293,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.13" + "version": "3.11.11" } }, "nbformat": 4, From a042a01f7826997d0ac99b330183b80cfee167df Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 13:44:20 -0400 Subject: [PATCH 11/58] added CPS_2023 to lite mode generation --- changelog_entry.yaml | 6 +++--- policyengine_us_data/datasets/cps/cps.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index ac664753..dcce3f1a 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,4 +1,4 @@ -- bump: minor +- bump: patch changes: - added: - - Enhanced CPS minimizing tests. \ No newline at end of file + changed: + - lite mode now builds CPS_2023 in addition to CPS_2024 diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 3b976a31..fde981ba 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2006,6 +2006,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if __name__ == "__main__": if test_lite: + CPS_2023().generate() CPS_2024().generate() else: CPS_2021().generate() From cabeb56c7a1fe926eaf4c5aa5ecd26f45df3043f Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 14:54:23 -0400 Subject: [PATCH 12/58] Fixed manual test --- .github/workflows/code_changes.yaml | 1 + .github/workflows/manual_tests.yaml | 2 +- policyengine_us_data/datasets/cps/cps.py | 1 + pyproject.toml | 4 ++-- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/code_changes.yaml b/.github/workflows/code_changes.yaml index 6b474227..edd804db 100644 --- a/.github/workflows/code_changes.yaml +++ b/.github/workflows/code_changes.yaml @@ -2,6 +2,7 @@ name: Code changes on: + workflow_call: push: branches: - main diff --git a/.github/workflows/manual_tests.yaml b/.github/workflows/manual_tests.yaml index a2daca18..fb13ba89 100644 --- a/.github/workflows/manual_tests.yaml +++ b/.github/workflows/manual_tests.yaml @@ -11,7 +11,7 @@ on: jobs: test: - uses: ./.github/workflows/pr_changelog.yaml + uses: ./.github/workflows/code_changes.yaml with: TEST_LITE: ${{ github.event.inputs.test_lite }} secrets: inherit diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index fde981ba..177f4707 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2008,6 +2008,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() + print(2 + 2) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index 65d1ca8e..3490ff1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,8 +17,8 @@ authors = [ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ - "policyengine-us>=1.340.0", - "policyengine-core>=3.17.1", + "policyengine-us>=1.333.0", + "policyengine-core>=3.14.1", "requests", "tqdm", 
"microdf_python>=0.4.3", From 7b76afba9eb55c3d2588c1ba5c6683a48e3709f7 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 15:02:22 -0400 Subject: [PATCH 13/58] try again with locked version --- policyengine_us_data/datasets/cps/cps.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 177f4707..09a594c3 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2008,7 +2008,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 2) + print(2 + 3) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index 3490ff1b..74af05bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ authors = [ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ - "policyengine-us>=1.333.0", + "policyengine-us==1.333.0", "policyengine-core>=3.14.1", "requests", "tqdm", From 4056df4762b5d5e98ff6da815eae8de1484a4c25 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 15:44:32 -0400 Subject: [PATCH 14/58] trying things --- policyengine_us_data/datasets/cps/cps.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 09a594c3..1edce6e9 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -100,9 +100,14 @@ def downsample(self, frac: float): original_dtypes = { key: original_data[key].dtype for key in original_data } - + print("\n\nHERE IS THE PROBLEM-----") + print(f"frac is {frac}") + print(self) + print(Microsimulation) sim = Microsimulation(dataset=self) - sim.subsample(frac=frac) + print(sim) + print(sim.subsample) + #sim.subsample(frac=frac) for key in original_data: if key not in sim.tax_benefit_system.variables: From 96c4c25b71b5e148059be66a28805ad41c8cc28b Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 15:45:47 -0400 Subject: [PATCH 15/58] lint --- policyengine_us_data/datasets/cps/cps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 1edce6e9..30688719 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -107,7 +107,7 @@ def downsample(self, frac: float): sim = Microsimulation(dataset=self) print(sim) print(sim.subsample) - #sim.subsample(frac=frac) + # sim.subsample(frac=frac) for key in original_data: if key not in sim.tax_benefit_system.variables: From e20c75c202531e72fd118107c40fa10a0cda6e79 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 16:05:26 -0400 Subject: [PATCH 16/58] trying 3.11.12 --- policyengine_us_data/datasets/cps/cps.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 30688719..8219e915 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -107,7 +107,7 @@ def downsample(self, frac: float): sim = Microsimulation(dataset=self) print(sim) print(sim.subsample) - # sim.subsample(frac=frac) + sim.subsample(frac=frac) for key in original_data: if key not in sim.tax_benefit_system.variables: @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if 
test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 3) + print(2 + 5) else: CPS_2021().generate() CPS_2022().generate() From 776eda8ce513f7e1b845cb8212abd17301e46c73 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 16:10:26 -0400 Subject: [PATCH 17/58] now actually specifying py version --- policyengine_us_data/datasets/cps/cps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 8219e915..a25aba26 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 5) + print(2 + 7) else: CPS_2021().generate() CPS_2022().generate() From cd771794473e0bb1f5005e7d6c598d8c1bc2a112 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 16:33:21 -0400 Subject: [PATCH 18/58] pandas v --- policyengine_us_data/datasets/cps/cps.py | 2 +- pyproject.toml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index a25aba26..b3554604 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 7) + print(2 + 8) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index 74af05bf..6c767ede 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ requires-python = ">=3.11, <3.13.0" dependencies = [ "policyengine-us==1.333.0", "policyengine-core>=3.14.1", + "pandas==2.3.1", "requests", "tqdm", "microdf_python>=0.4.3", From d0ce44db56b066e4d370bc434fba08435f65e01f Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 16:47:12 -0400 Subject: [PATCH 19/58] small runner --- .github/workflows/pr_code_changes.yaml | 2 +- policyengine_us_data/datasets/cps/cps.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 213d192f..385e5a4c 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -49,7 +49,7 @@ jobs: run: python -c "from policyengine_core.data import Dataset; print('Core import OK')" Test: - runs-on: larger-runner + runs-on: ubuntu-latest needs: Lint env: HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index b3554604..027c2ef5 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 8) + print(2 + 0) else: CPS_2021().generate() CPS_2022().generate() From eb96cd5f706b0b718c39e36fa4fd1854bb3e3b0d Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 16:53:57 -0400 Subject: [PATCH 20/58] trying everything --- .github/workflows/pr_code_changes.yaml | 2 +- policyengine_us_data/datasets/cps/cps.py | 2 +- pyproject.toml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 385e5a4c..02209591 100644 --- a/.github/workflows/pr_code_changes.yaml +++ 
b/.github/workflows/pr_code_changes.yaml @@ -63,7 +63,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.11' + python-version: '3.11.12' - name: Install package run: uv pip install -e .[dev] --system diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 027c2ef5..afbf223f 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 0) + print(2 + 9) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index 6c767ede..d87290a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,11 +15,11 @@ authors = [ {name = "PolicyEngine", email = "hello@policyengine.org"}, ] license = {file = "LICENSE"} -requires-python = ">=3.11, <3.13.0" +requires-python = ">=3.11, <3.11.13" dependencies = [ "policyengine-us==1.333.0", "policyengine-core>=3.14.1", - "pandas==2.3.1", + "pandas==2.3.0", "requests", "tqdm", "microdf_python>=0.4.3", From 59ff94e82cd4dbd0aba16b488fd0b8ec16ca5531 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 17:02:45 -0400 Subject: [PATCH 21/58] relaxing python version in pyproject.toml --- policyengine_us_data/datasets/cps/cps.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index afbf223f..3173d4d6 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 9) + print(3 + 9) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index d87290a2..fe5fda52 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ authors = [ {name = "PolicyEngine", email = "hello@policyengine.org"}, ] license = {file = "LICENSE"} -requires-python = ">=3.11, <3.11.13" +requires-python = ">=3.11, <3.13.0" dependencies = [ "policyengine-us==1.333.0", "policyengine-core>=3.14.1", From d3fa67bf98762b48c6fe2397275c1d0aac2ff77b Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 17:29:12 -0400 Subject: [PATCH 22/58] putting things back in order. 
--- policyengine_us_data/datasets/cps/cps.py | 7 ------- pyproject.toml | 2 +- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 3173d4d6..d9957cbb 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -100,13 +100,7 @@ def downsample(self, frac: float): original_dtypes = { key: original_data[key].dtype for key in original_data } - print("\n\nHERE IS THE PROBLEM-----") - print(f"frac is {frac}") - print(self) - print(Microsimulation) sim = Microsimulation(dataset=self) - print(sim) - print(sim.subsample) sim.subsample(frac=frac) for key in original_data: @@ -2013,7 +2007,6 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(3 + 9) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index fe5fda52..4bec19eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ authors = [ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ - "policyengine-us==1.333.0", + "policyengine-us==1.340.1", "policyengine-core>=3.14.1", "pandas==2.3.0", "requests", From 273c48d7bc9db1d6f06fa859897b63c30d37b044 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Sat, 12 Jul 2025 13:01:15 +0100 Subject: [PATCH 23/58] Use normal runner in PR tests --- .github/workflows/pr_code_changes.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 02209591..c84a4b97 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -49,7 +49,7 @@ jobs: run: python -c "from policyengine_core.data import Dataset; print('Core import OK')" Test: - runs-on: ubuntu-latest + runs-on: ubuntu-latest needs: Lint env: HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} From 8c2fbda847e9945878afa4085476f56895c360f1 Mon Sep 17 00:00:00 2001 From: baogorek Date: Sat, 12 Jul 2025 09:53:07 -0400 Subject: [PATCH 24/58] added the 3.11.12 pin --- .github/workflows/code_changes.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/code_changes.yaml b/.github/workflows/code_changes.yaml index edd804db..c2340d14 100644 --- a/.github/workflows/code_changes.yaml +++ b/.github/workflows/code_changes.yaml @@ -38,7 +38,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.11' + python-version: '3.11.12' - uses: "google-github-actions/auth@v2" with: workload_identity_provider: "projects/322898545428/locations/global/workloadIdentityPools/policyengine-research-id-pool/providers/prod-github-provider" From edb09456bb8548b8b4eb94136122ab5a5b33586e Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 20:00:50 -0400 Subject: [PATCH 25/58] cps.py --- policyengine_us_data/datasets/cps/cps.py | 1 + 1 file changed, 1 insertion(+) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index d9957cbb..202f9c69 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2007,6 +2007,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() + print(3) else: CPS_2021().generate() CPS_2022().generate() From 994ac15a636b99f951e205ecb3a861e72cdc3472 Mon Sep 17 00:00:00 2001 From: baogorek 
Date: Sun, 13 Jul 2025 20:32:26 -0400 Subject: [PATCH 26/58] adding diagnostics --- .../datasets/cps/enhanced_cps.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 6ad510f3..17d3e862 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -249,6 +249,23 @@ def generate(self): ) data["household_weight"][year] = optimised_weights + print("\n\n---reweighting quick diagnostics----\n") + estimate = optimised_weights @ loss_matrix + rel_error = ( + ((estimate - targets_array) + 1) / (targets_array + 1) + ) ** 2 + print( + f"rel_error: min: {np.min(rel_error):.2f}, max: {np.max(rel_error):.2f}", + f"mean: {np.mean(rel_error):.2f}, median: {np.median(rel_error):.2f}" + ) + print("Relative error over 100% for:") + for i in np.where(rel_error > 1)[0]: + print(f"target_name: {loss_matrix.columns[i]}") + print(f"target_value: {targets_array[i]}") + print(f"estimate_value: {estimate[i]}") + print(f"has rel_error: {rel_error.values[i]:.2f}\n") + print("---End of reweighting quick diagnostics------") + self.save_dataset(data) From 341a3559f4368f65947db8f0ebe4db67e39a671c Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 20:32:47 -0400 Subject: [PATCH 27/58] lint --- policyengine_us_data/datasets/cps/enhanced_cps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 17d3e862..0da67ceb 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -256,7 +256,7 @@ def generate(self): ) ** 2 print( f"rel_error: min: {np.min(rel_error):.2f}, max: {np.max(rel_error):.2f}", - f"mean: {np.mean(rel_error):.2f}, median: {np.median(rel_error):.2f}" + f"mean: {np.mean(rel_error):.2f}, median: {np.median(rel_error):.2f}", ) print("Relative error over 100% for:") for i in np.where(rel_error > 1)[0]: From c2ab4b6466de68c8970ac859157bc941fc56287b Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 22:27:46 -0400 Subject: [PATCH 28/58] taking out bad targets --- policyengine_us_data/datasets/cps/cps.py | 1 - .../datasets/cps/enhanced_cps.py | 59 +++++++++++++++++-- 2 files changed, 53 insertions(+), 7 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 202f9c69..d9957cbb 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2007,7 +2007,6 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(3) else: CPS_2021().generate() CPS_2022().generate() diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 0da67ceb..e7a57044 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -40,7 +40,7 @@ def reweight( targets_array, dropout_rate=0.05, log_path="calibration_log.csv", - penalty_approach=None, + epochs=150, ): target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") @@ -58,8 +58,8 @@ def reweight( np.log(original_weights), requires_grad=True, dtype=torch.float32 ) - # TO DO: replace this with a call to the python reweight.py package. 
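# --- Editor's illustrative sketch (not part of the patch above) ---------------
# The "adding diagnostics" commit above prints a smoothed squared relative error
# per calibration target and flags targets missed by more than 100%. A minimal
# standalone version of that check, assuming numpy arrays: `loss_matrix` is the
# households-by-targets estimate matrix and `target_names` its column labels
# (argument names here are chosen for illustration only).
import numpy as np


def calibration_diagnostics(
    weights, loss_matrix, targets, target_names, threshold=1.0
):
    estimate = weights @ loss_matrix
    # The +1 terms mirror the patch and guard against zero-valued targets
    # blowing up the ratio.
    rel_error = (((estimate - targets) + 1) / (targets + 1)) ** 2
    print(
        f"rel_error: min {rel_error.min():.2f}, max {rel_error.max():.2f}, "
        f"mean {rel_error.mean():.2f}, median {np.median(rel_error):.2f}"
    )
    for i in np.where(rel_error > threshold)[0]:
        print(
            f"{target_names[i]}: target {targets[i]}, "
            f"estimate {estimate[i]:.0f}, rel_error {rel_error[i]:.2f}"
        )
    return rel_error
# -------------------------------------------------------------------------------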
- def loss(weights, penalty_approach=penalty_approach): + # TODO: replace this functionality from the microcalibrate package. + def loss(weights): # Check for Nans in either the weights or the loss matrix if torch.isnan(weights).any(): raise ValueError("Weights contain NaNs") @@ -129,7 +129,7 @@ def dropout_weights(weights, p): start_loss = None - iterator = trange(500) + iterator = trange(epochs) performance = pd.DataFrame() for i in iterator: optimizer.zero_grad() @@ -229,13 +229,37 @@ def generate(self): original_weights = original_weights.values + np.random.normal( 1, 0.1, len(original_weights) ) + + bad_targets = [ + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "state/RI/adjusted_gross_income/amount/-inf_1", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "state/RI/adjusted_gross_income/amount/-inf_1", + "target_name: nation/irs/exempt interest/count/AGI in -inf-inf/taxable/All", + ] + + # Run the optimization procedure to get (close to) minimum loss weights for year in range(self.start_year, self.end_year + 1): loss_matrix, targets_array = build_loss_matrix( self.input_dataset, year ) - + zero_mask = np.isclose(targets_array, 0.0, atol=0.1) bad_mask = loss_matrix.columns.isin(bad_targets) - keep_mask_bool = ~bad_mask + keep_mask_bool = ~(zero_mask | bad_mask) keep_idx = np.where(keep_mask_bool)[0] loss_matrix_clean = loss_matrix.iloc[:, keep_idx] targets_array_clean = targets_array[keep_idx] @@ -245,10 +269,33 @@ def generate(self): original_weights, loss_matrix_clean, targets_array_clean, + loss_matrix_clean, + targets_array_clean, log_path="calibration_log.csv", + epochs=150, ) data["household_weight"][year] = optimised_weights + print("\n\n---reweighting quick diagnostics----\n") + estimate = optimised_weights @ loss_matrix_clean + rel_error = ( + ((estimate - targets_array_clean) + 1) + / (targets_array_clean + 1) + ) ** 2 + print( + f"rel_error: min: {np.min(rel_error):.2f}, " + f"max: {np.max(rel_error):.2f} " + f"mean: {np.mean(rel_error):.2f}, " + f"median: {np.median(rel_error):.2f}" + ) + print("Relative error over 100% for:") + for i in np.where(rel_error > 1)[0]: + print(f"target_name: {loss_matrix_clean.columns[i]}") + print(f"target_value: {targets_array_clean[i]}") + 
print(f"estimate_value: {estimate[i]}") + print(f"has rel_error: {rel_error[i]:.2f}\n") + print("---End of reweighting quick diagnostics------") + print("\n\n---reweighting quick diagnostics----\n") estimate = optimised_weights @ loss_matrix rel_error = ( From 6f7a03a76dc95d7f9ebfd20f1df6240bd11593bc Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 23:05:09 -0400 Subject: [PATCH 29/58] fixing workflow arg passthrough --- .github/workflows/pr_code_changes.yaml | 16 +++++++++++++--- changelog_entry.yaml | 6 ++++++ pyproject.toml | 4 ++-- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index c84a4b97..56224a2e 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -11,6 +11,14 @@ on: - tests/** - .github/workflows/** + workflow_call: + inputs: + TEST_LITE: + description: 'Run in lite mode' + type: boolean + required: false + default: false + jobs: Lint: runs-on: ubuntu-latest @@ -53,6 +61,7 @@ jobs: needs: Lint env: HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} + TEST_LITE: ${{ inputs.TEST_LITE }} steps: - name: Checkout repo uses: actions/checkout@v2 @@ -63,7 +72,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.11.12' + python-version: '3.11' - name: Install package run: uv pip install -e .[dev] --system @@ -75,8 +84,9 @@ jobs: - name: Build datasets run: make data env: - TEST_LITE: true - PYTHON_LOG_LEVEL: INFO + TEST_LITE: ${{ env.TEST_LITE }} + PYTHON_LOG_LEVEL: INFO + - name: Save calibration log uses: actions/upload-artifact@v4 with: diff --git a/changelog_entry.yaml b/changelog_entry.yaml index dcce3f1a..bce8b349 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,4 +1,10 @@ - bump: patch changes: changed: + - bad targets (causing problems with estimation) removed - lite mode now builds CPS_2023 in addition to CPS_2024 + - gave reweight an epochs argument and set it at 150 for optimization + - updating minimum versions on policyengine-us and pandas dependencies + fixed: + - manual workflow now can call PR code changes + diff --git a/pyproject.toml b/pyproject.toml index 4bec19eb..481cbc37 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,9 +17,9 @@ authors = [ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ - "policyengine-us==1.340.1", + "policyengine-us>=1.340.1", "policyengine-core>=3.14.1", - "pandas==2.3.0", + "pandas>=2.3.0", "requests", "tqdm", "microdf_python>=0.4.3", From 3dba2a2aa3a578aeaa7e7acde71e53d150669036 Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 23:09:32 -0400 Subject: [PATCH 30/58] deps and defaults --- .github/workflows/code_changes.yaml | 2 +- .github/workflows/pr_code_changes.yaml | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/code_changes.yaml b/.github/workflows/code_changes.yaml index c2340d14..edd804db 100644 --- a/.github/workflows/code_changes.yaml +++ b/.github/workflows/code_changes.yaml @@ -38,7 +38,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.11.12' + python-version: '3.11' - uses: "google-github-actions/auth@v2" with: workload_identity_provider: "projects/322898545428/locations/global/workloadIdentityPools/policyengine-research-id-pool/providers/prod-github-provider" diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 
56224a2e..1e05b564 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -17,7 +17,7 @@ on: description: 'Run in lite mode' type: boolean required: false - default: false + default: true jobs: Lint: diff --git a/pyproject.toml b/pyproject.toml index 481cbc37..f983258d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ "policyengine-us>=1.340.1", - "policyengine-core>=3.14.1", + "policyengine-core>=3.17.1", "pandas>=2.3.0", "requests", "tqdm", From 7710a4cd0f58de7b2120f146228977e9c46f253d Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 23:12:21 -0400 Subject: [PATCH 31/58] wrong pipeline for manual test --- .github/workflows/manual_tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/manual_tests.yaml b/.github/workflows/manual_tests.yaml index fb13ba89..fd6fa061 100644 --- a/.github/workflows/manual_tests.yaml +++ b/.github/workflows/manual_tests.yaml @@ -11,7 +11,7 @@ on: jobs: test: - uses: ./.github/workflows/code_changes.yaml + uses: ./.github/workflows/pr_code_changes.yaml with: TEST_LITE: ${{ github.event.inputs.test_lite }} secrets: inherit From 27f46fd8d19199fad6006675bcab231da67968af Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 23:30:46 -0400 Subject: [PATCH 32/58] trying again to get the manual test to work --- .github/workflows/manual_tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/manual_tests.yaml b/.github/workflows/manual_tests.yaml index fd6fa061..55667dbc 100644 --- a/.github/workflows/manual_tests.yaml +++ b/.github/workflows/manual_tests.yaml @@ -13,5 +13,5 @@ jobs: test: uses: ./.github/workflows/pr_code_changes.yaml with: - TEST_LITE: ${{ github.event.inputs.test_lite }} + TEST_LITE: ${{ inputs.test_lite }} secrets: inherit From fef1eca57d99d8359f335ac4886eebde5b45c6c9 Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 23:53:27 -0400 Subject: [PATCH 33/58] reverting to older workflow code --- .github/workflows/manual_tests.yaml | 17 ----------------- .github/workflows/pr_code_changes.yaml | 14 ++------------ changelog_entry.yaml | 4 +--- 3 files changed, 3 insertions(+), 32 deletions(-) delete mode 100644 .github/workflows/manual_tests.yaml diff --git a/.github/workflows/manual_tests.yaml b/.github/workflows/manual_tests.yaml deleted file mode 100644 index 55667dbc..00000000 --- a/.github/workflows/manual_tests.yaml +++ /dev/null @@ -1,17 +0,0 @@ -name: Manual tests - -on: - workflow_dispatch: - inputs: - test_lite: - description: 'Run in lite mode' - required: true - default: true - type: boolean - -jobs: - test: - uses: ./.github/workflows/pr_code_changes.yaml - with: - TEST_LITE: ${{ inputs.test_lite }} - secrets: inherit diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 1e05b564..4e30d089 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -11,14 +11,6 @@ on: - tests/** - .github/workflows/** - workflow_call: - inputs: - TEST_LITE: - description: 'Run in lite mode' - type: boolean - required: false - default: true - jobs: Lint: runs-on: ubuntu-latest @@ -61,7 +53,6 @@ jobs: needs: Lint env: HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} - TEST_LITE: ${{ inputs.TEST_LITE }} steps: - name: Checkout repo uses: actions/checkout@v2 @@ -84,9 +75,8 @@ jobs: - name: Build datasets run: make data 
env: - TEST_LITE: ${{ env.TEST_LITE }} - PYTHON_LOG_LEVEL: INFO - + TEST_LITE: true + PYTHON_LOG_LEVEL: INFO - name: Save calibration log uses: actions/upload-artifact@v4 with: diff --git a/changelog_entry.yaml b/changelog_entry.yaml index bce8b349..3f9b8627 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -5,6 +5,4 @@ - lite mode now builds CPS_2023 in addition to CPS_2024 - gave reweight an epochs argument and set it at 150 for optimization - updating minimum versions on policyengine-us and pandas dependencies - fixed: - - manual workflow now can call PR code changes - + - getting rid of non-working manual workflow code From 5eb10501cd4e8f33925411de7f4574e3dec413f8 Mon Sep 17 00:00:00 2001 From: baogorek Date: Mon, 14 Jul 2025 00:12:37 -0400 Subject: [PATCH 34/58] cleaning up enhanced_cps.py --- .../datasets/cps/enhanced_cps.py | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index e7a57044..5c82d724 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -249,7 +249,7 @@ def generate(self): "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", "state/RI/adjusted_gross_income/amount/-inf_1", - "target_name: nation/irs/exempt interest/count/AGI in -inf-inf/taxable/All", + "nation/irs/exempt interest/count/AGI in -inf-inf/taxable/All", ] # Run the optimization procedure to get (close to) minimum loss weights @@ -296,23 +296,6 @@ def generate(self): print(f"has rel_error: {rel_error[i]:.2f}\n") print("---End of reweighting quick diagnostics------") - print("\n\n---reweighting quick diagnostics----\n") - estimate = optimised_weights @ loss_matrix - rel_error = ( - ((estimate - targets_array) + 1) / (targets_array + 1) - ) ** 2 - print( - f"rel_error: min: {np.min(rel_error):.2f}, max: {np.max(rel_error):.2f}", - f"mean: {np.mean(rel_error):.2f}, median: {np.median(rel_error):.2f}", - ) - print("Relative error over 100% for:") - for i in np.where(rel_error > 1)[0]: - print(f"target_name: {loss_matrix.columns[i]}") - print(f"target_value: {targets_array[i]}") - print(f"estimate_value: {estimate[i]}") - print(f"has rel_error: {rel_error.values[i]:.2f}\n") - print("---End of reweighting quick diagnostics------") - self.save_dataset(data) From 1fb4318b21072a9c5dbd2824216be49655f0b9b2 Mon Sep 17 00:00:00 2001 From: MaxGhenis Date: Mon, 14 Jul 2025 15:33:13 +0000 Subject: [PATCH 35/58] Update package version --- CHANGELOG.md | 11 +++++++++++ changelog.yaml | 9 +++++++++ changelog_entry.yaml | 8 -------- pyproject.toml | 2 +- 4 files changed, 21 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6299d8fb..e355d4dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## [1.37.1] - 2025-07-14 15:33:11 + +### Changed + +- bad targets (causing problems with estimation) removed +- lite mode now builds CPS_2023 in addition to CPS_2024 +- gave reweight an epochs argument and set it at 150 for optimization +- updating minimum versions on policyengine-us and pandas dependencies +- getting rid of non-working manual workflow code + ## [1.37.0] - 2025-07-09 14:58:33 ### Added @@ -520,6 +530,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 +[1.37.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.37.0...1.37.1 [1.37.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.36.2...1.37.0 [1.36.2]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.36.1...1.36.2 [1.36.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.36.0...1.36.1 diff --git a/changelog.yaml b/changelog.yaml index 699b2430..af7cdf32 100644 --- a/changelog.yaml +++ b/changelog.yaml @@ -433,3 +433,12 @@ added: - Medicaid state level calibration targets. date: 2025-07-09 14:58:33 +- bump: patch + changes: + changed: + - bad targets (causing problems with estimation) removed + - lite mode now builds CPS_2023 in addition to CPS_2024 + - gave reweight an epochs argument and set it at 150 for optimization + - updating minimum versions on policyengine-us and pandas dependencies + - getting rid of non-working manual workflow code + date: 2025-07-14 15:33:11 diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 3f9b8627..e69de29b 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,8 +0,0 @@ -- bump: patch - changes: - changed: - - bad targets (causing problems with estimation) removed - - lite mode now builds CPS_2023 in addition to CPS_2024 - - gave reweight an epochs argument and set it at 150 for optimization - - updating minimum versions on policyengine-us and pandas dependencies - - getting rid of non-working manual workflow code diff --git a/pyproject.toml b/pyproject.toml index f983258d..5a75693f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta" [project] name = "policyengine_us_data" -version = "1.37.0" +version = "1.37.1" description = "A package to create representative microdata for the US." readme = "README.md" authors = [ From a62328a6f47293f90e1e696d03b49b96c044321b Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Fri, 11 Jul 2025 16:24:22 +0200 Subject: [PATCH 36/58] attempting to vectorize minimizing of ecps --- changelog_entry.yaml | 4 ++ .../datasets/cps/enhanced_cps.py | 53 +++++++------------ policyengine_us_data/utils/minimise.py | 51 ++++++++++++------ 3 files changed, 59 insertions(+), 49 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..84eeb584 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: minor + changes: + added: + - Enhanced CPS minimizing tests. \ No newline at end of file diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 5c82d724..6616d54c 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -58,8 +58,8 @@ def reweight( np.log(original_weights), requires_grad=True, dtype=torch.float32 ) - # TODO: replace this functionality from the microcalibrate package. - def loss(weights): + # TO DO: replace this with a call to the python reweight.py package. 
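# Standalone sketch of the three smooth L0 surrogates introduced in the loss
# function below, evaluated on a toy weight vector (epsilon matches the diff;
# the weight values are illustrative). The sigmoid and exponential forms
# approximate the fraction of weights above epsilon, while the log-sum form
# penalises weight magnitude more smoothly.
import torch

weights = torch.tensor([0.0, 5e-4, 0.05, 2.0])
epsilon = 1e-3

l0_sigmoid = torch.sigmoid((weights - epsilon) / (epsilon * 0.1)).mean()
l0_log = torch.log(1 + weights / epsilon).sum() / len(weights)
l0_exp = (1 - torch.exp(-weights / epsilon)).mean()
print(l0_sigmoid.item(), l0_log.item(), l0_exp.item())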
+ def loss(weights, penalty_approach="l0_sigmoid"): # Check for Nans in either the weights or the loss matrix if torch.isnan(weights).any(): raise ValueError("Weights contain NaNs") @@ -76,43 +76,30 @@ def loss(weights): if torch.isnan(rel_error_normalized).any(): raise ValueError("Relative error contains NaNs") - if penalty_approach is not None: - # L0 penalty (approximated with smooth function) - # Since L0 is non-differentiable, we use a smooth approximation - # Common approaches: + # L0 penalty (approximated with smooth function) + # Since L0 is non-differentiable, we use a smooth approximation + # Common approaches: - epsilon = 1e-3 # Threshold for "near zero" - l0_penalty_weight = 1e-1 # Adjust this hyperparameter + epsilon = 1e-3 # Threshold for "near zero" + l0_penalty_weight = 1e-1 # Adjust this hyperparameter - # Option 1: Sigmoid approximation - if penalty_approach == "l0_sigmoid": - smoothed_l0 = torch.sigmoid( - (weights - epsilon) / (epsilon * 0.1) - ).mean() + # Option 1: Sigmoid approximation + if penalty_approach == "l0_sigmoid": + smoothed_l0 = torch.sigmoid( + (weights - epsilon) / (epsilon * 0.1) + ).mean() - # Option 2: Log-sum penalty (smoother) - if penalty_approach == "l0_log": - smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len( - weights - ) + # Option 2: Log-sum penalty (smoother) + if penalty_approach == "l0_log": + smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len(weights) - # Option 3: Exponential penalty - if penalty_approach == "l0_exp": - smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() + # Option 3: Exponential penalty + if penalty_approach == "l0_exp": + smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() - # L1 penalty - l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs + # L1 penalty - if penalty_approach == "l1": - l1 = torch.mean(weights) - return rel_error_normalized.mean() + l1_penalty_weight * l1 - - return ( - rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 - ) - - else: - return rel_error_normalized.mean() + return rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 def dropout_weights(weights, p): if p == 0: diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index df193c6e..ca985378 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -5,21 +5,10 @@ import pandas as pd import h5py from policyengine_us_data.storage import STORAGE_FOLDER -from typing import Optional, Callable +from typing import Optional -bad_targets = [ - "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", - "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", - "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", - "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", - "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", -] - -def create_calibration_log_file(file_path, epoch=0): +def create_calibration_log_file(file_path): dataset = Dataset.from_file(file_path) loss_matrix, targets = build_loss_matrix(dataset, 2024) @@ -112,6 +101,27 @@ def losses_for_candidates( return losses +def minimise_dataset( + dataset, 
output_path: str, loss_rel_change_max: float +) -> None: + dataset = str(dataset) + create_calibration_log_file(dataset) + + dataset = Dataset.from_file(dataset) + loss_matrix = build_loss_matrix(dataset, 2024) + + sim = Microsimulation(dataset=dataset) + + weights = sim.calculate("household_weight", 2024).values + estimate_matrix, targets = loss_matrix + is_national = estimate_matrix.columns.str.startswith("nation/") + nation_normalisation_factor = is_national * (1 / is_national.sum()) + state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) + normalisation_factor = np.where( + is_national, nation_normalisation_factor, state_normalisation_factor + ) + weights @ estimate_matrix + def get_loss_from_mask( weights, inclusion_mask, estimate_matrix, targets, normalisation_factor ): @@ -185,16 +195,25 @@ def candidate_loss_contribution( replace=False, ) - # Compute losses for the batch in one shot + # more efficient approach to compute losses for candidate households to be removed + + # 1. sample only households that are currently *included* + indices = np.random.choice( + np.where(full_mask)[0], + size=int(full_mask.sum() * VIEW_FRACTION_PER_ITERATION), + replace=False, + ) + # 2. compute losses for the batch in one shot candidate_losses = losses_for_candidates( weights, indices, estimate_matrix, targets, normalisation_factor ) - - # Convert to relative change vs. baseline + # 3. convert to relative change vs. baseline household_loss_rel_changes = ( candidate_losses - baseline_loss ) / baseline_loss + inclusion_mask = full_mask.copy() + household_loss_rel_changes = np.array(household_loss_rel_changes) # Sort by the relative change in loss sorted_indices = np.argsort(household_loss_rel_changes) From 6d3f8b4daea6ab498b105bf9429b74e52462cde4 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Fri, 11 Jul 2025 19:19:58 +0200 Subject: [PATCH 37/58] add notebook with testing functionality (havent tested locally) --- .../datasets/cps/enhanced_cps.py | 9 +- policyengine_us_data/utils/minimise.py | 2 +- test_minimization_approach.ipynb | 210 +----------------- 3 files changed, 16 insertions(+), 205 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 6616d54c..ca53a84d 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -40,7 +40,7 @@ def reweight( targets_array, dropout_rate=0.05, log_path="calibration_log.csv", - epochs=150, + penalty_approach="l0_sigmoid", ): target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") @@ -59,7 +59,7 @@ def reweight( ) # TO DO: replace this with a call to the python reweight.py package. 
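# Toy sketch of the masked-weights loss used in minimise.py: excluded households
# are zero-weighted and the remaining weights are rescaled so the population
# total is preserved before the loss is evaluated. All arrays here are
# illustrative stand-ins for the real weights, loss matrix and targets.
import numpy as np

weights = np.array([1.0, 2.0, 3.0, 4.0])
estimate_matrix = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0], [2.0, 0.0]])
targets = np.array([10.0, 6.0])
normalisation_factor = np.array([0.5, 0.5])
inclusion_mask = np.array([True, False, True, True])

masked_weights = weights.copy()
original_weight_total = masked_weights.sum()
masked_weights[~inclusion_mask] = 0
masked_weights[inclusion_mask] *= original_weight_total / masked_weights.sum()

estimates = masked_weights @ estimate_matrix
rel_error = ((estimates - targets) + 1) / (targets + 1)
loss = ((rel_error * normalisation_factor) ** 2).mean()
print(loss)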
- def loss(weights, penalty_approach="l0_sigmoid"): + def loss(weights, penalty_approach=penalty_approach): # Check for Nans in either the weights or the loss matrix if torch.isnan(weights).any(): raise ValueError("Weights contain NaNs") @@ -98,6 +98,11 @@ def loss(weights, penalty_approach="l0_sigmoid"): smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() # L1 penalty + l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs + + if penalty_approach == "l1": + l1 = torch.mean(weights) + return rel_error_normalized.mean() + l1_penalty_weight * l1 return rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index ca985378..da2cb7d1 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -386,7 +386,7 @@ def minimise_dataset( f.create_dataset(f"{variable}/{year}", data=value) print(f"Saved minimised dataset to {output_path}") - create_calibration_log_file(output_path, epoch=500) + create_calibration_log_file(output_path) if __name__ == "__main__": diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 54f3c6fa..519d2725 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 5, "id": "d6dc9cca", "metadata": {}, "outputs": [], @@ -12,199 +12,15 @@ "from policyengine_us import Microsimulation\n", "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", "from policyengine_us_data.utils import build_loss_matrix\n", - "import numpy as np\n", - "import os" + "import numpy as np" ] }, { "cell_type": "code", "execution_count": null, - "id": "6daabe7c", - "metadata": {}, - "outputs": [], - "source": [ - "# Original ECPS 2024 dataset size (for household entity): 41310\n", - "# After minimization through \"candidate_loss_contribution\" and a 1.0 max error change (for household entity): 20655 \n", - "# After minimization through \"candidate_loss_contribution\" and a 0.001 max error change (for household entity): 24786\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, "id": "db975ac1", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Targeting Medicaid enrollment for AK with target 231577k\n", - "Targeting Medicaid enrollment for AL with target 766009k\n", - "Targeting Medicaid enrollment for AR with target 733561k\n", - "Targeting Medicaid enrollment for AZ with target 1778734k\n", - "Targeting Medicaid enrollment for CA with target 12172695k\n", - "Targeting Medicaid enrollment for CO with target 1058326k\n", - "Targeting Medicaid enrollment for CT with target 904321k\n", - "Targeting Medicaid enrollment for DC with target 240020k\n", - "Targeting Medicaid enrollment for DE with target 236840k\n", - "Targeting Medicaid enrollment for FL with target 3568648k\n", - "Targeting Medicaid enrollment for GA with target 1699279k\n", - "Targeting Medicaid enrollment for HI with target 376318k\n", - "Targeting Medicaid enrollment for IA with target 586748k\n", - "Targeting Medicaid enrollment for ID with target 296968k\n", - "Targeting Medicaid enrollment for IL with target 2918179k\n", - "Targeting Medicaid enrollment for IN with target 1623361k\n", - "Targeting Medicaid enrollment for KS with target 335902k\n", - "Targeting Medicaid enrollment for KY with target 1244822k\n", - "Targeting Medicaid enrollment for LA 
with target 1377806k\n", - "Targeting Medicaid enrollment for MA with target 1453344k\n", - "Targeting Medicaid enrollment for MD with target 1280697k\n", - "Targeting Medicaid enrollment for ME with target 322306k\n", - "Targeting Medicaid enrollment for MI with target 2194067k\n", - "Targeting Medicaid enrollment for MN with target 1146667k\n", - "Targeting Medicaid enrollment for MO with target 1118780k\n", - "Targeting Medicaid enrollment for MS with target 514730k\n", - "Targeting Medicaid enrollment for MT with target 193278k\n", - "Targeting Medicaid enrollment for NC with target 2469712k\n", - "Targeting Medicaid enrollment for ND with target 100543k\n", - "Targeting Medicaid enrollment for NE with target 302971k\n", - "Targeting Medicaid enrollment for NH with target 166813k\n", - "Targeting Medicaid enrollment for NJ with target 1506239k\n", - "Targeting Medicaid enrollment for NM with target 686825k\n", - "Targeting Medicaid enrollment for NV with target 713936k\n", - "Targeting Medicaid enrollment for NY with target 5946806k\n", - "Targeting Medicaid enrollment for OH with target 2596879k\n", - "Targeting Medicaid enrollment for OK with target 894911k\n", - "Targeting Medicaid enrollment for OR with target 1123313k\n", - "Targeting Medicaid enrollment for PA with target 2783389k\n", - "Targeting Medicaid enrollment for RI with target 273400k\n", - "Targeting Medicaid enrollment for SC with target 932515k\n", - "Targeting Medicaid enrollment for SD with target 126952k\n", - "Targeting Medicaid enrollment for TN with target 1268904k\n", - "Targeting Medicaid enrollment for TX with target 3821806k\n", - "Targeting Medicaid enrollment for UT with target 300742k\n", - "Targeting Medicaid enrollment for VA with target 1596777k\n", - "Targeting Medicaid enrollment for VT with target 151833k\n", - "Targeting Medicaid enrollment for WA with target 1776116k\n", - "Targeting Medicaid enrollment for WI with target 1108320k\n", - "Targeting Medicaid enrollment for WV with target 467632k\n", - "Targeting Medicaid enrollment for WY with target 57320k\n", - "Targeting Medicaid enrollment for AK with target 231577k\n", - "Targeting Medicaid enrollment for AL with target 766009k\n", - "Targeting Medicaid enrollment for AR with target 733561k\n", - "Targeting Medicaid enrollment for AZ with target 1778734k\n", - "Targeting Medicaid enrollment for CA with target 12172695k\n", - "Targeting Medicaid enrollment for CO with target 1058326k\n", - "Targeting Medicaid enrollment for CT with target 904321k\n", - "Targeting Medicaid enrollment for DC with target 240020k\n", - "Targeting Medicaid enrollment for DE with target 236840k\n", - "Targeting Medicaid enrollment for FL with target 3568648k\n", - "Targeting Medicaid enrollment for GA with target 1699279k\n", - "Targeting Medicaid enrollment for HI with target 376318k\n", - "Targeting Medicaid enrollment for IA with target 586748k\n", - "Targeting Medicaid enrollment for ID with target 296968k\n", - "Targeting Medicaid enrollment for IL with target 2918179k\n", - "Targeting Medicaid enrollment for IN with target 1623361k\n", - "Targeting Medicaid enrollment for KS with target 335902k\n", - "Targeting Medicaid enrollment for KY with target 1244822k\n", - "Targeting Medicaid enrollment for LA with target 1377806k\n", - "Targeting Medicaid enrollment for MA with target 1453344k\n", - "Targeting Medicaid enrollment for MD with target 1280697k\n", - "Targeting Medicaid enrollment for ME with target 322306k\n", - "Targeting Medicaid enrollment for MI with 
target 2194067k\n", - "Targeting Medicaid enrollment for MN with target 1146667k\n", - "Targeting Medicaid enrollment for MO with target 1118780k\n", - "Targeting Medicaid enrollment for MS with target 514730k\n", - "Targeting Medicaid enrollment for MT with target 193278k\n", - "Targeting Medicaid enrollment for NC with target 2469712k\n", - "Targeting Medicaid enrollment for ND with target 100543k\n", - "Targeting Medicaid enrollment for NE with target 302971k\n", - "Targeting Medicaid enrollment for NH with target 166813k\n", - "Targeting Medicaid enrollment for NJ with target 1506239k\n", - "Targeting Medicaid enrollment for NM with target 686825k\n", - "Targeting Medicaid enrollment for NV with target 713936k\n", - "Targeting Medicaid enrollment for NY with target 5946806k\n", - "Targeting Medicaid enrollment for OH with target 2596879k\n", - "Targeting Medicaid enrollment for OK with target 894911k\n", - "Targeting Medicaid enrollment for OR with target 1123313k\n", - "Targeting Medicaid enrollment for PA with target 2783389k\n", - "Targeting Medicaid enrollment for RI with target 273400k\n", - "Targeting Medicaid enrollment for SC with target 932515k\n", - "Targeting Medicaid enrollment for SD with target 126952k\n", - "Targeting Medicaid enrollment for TN with target 1268904k\n", - "Targeting Medicaid enrollment for TX with target 3821806k\n", - "Targeting Medicaid enrollment for UT with target 300742k\n", - "Targeting Medicaid enrollment for VA with target 1596777k\n", - "Targeting Medicaid enrollment for VT with target 151833k\n", - "Targeting Medicaid enrollment for WA with target 1776116k\n", - "Targeting Medicaid enrollment for WI with target 1108320k\n", - "Targeting Medicaid enrollment for WV with target 467632k\n", - "Targeting Medicaid enrollment for WY with target 57320k\n", - "Iteration 1: Loss changed from 3.2762747622384236e-07 to 3.269240481699973e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 2: Loss changed from 3.269240481699973e-07 to 3.260236288186747e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 3: Loss changed from 3.260236288186747e-07 to 3.258031076364204e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 4: Loss changed from 3.258031076364204e-07 to 3.2572688458943833e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 5: Loss changed from 3.2572688458943833e-07 to 3.2598496400284314e-07, which is too high (0.08%). 
Stopping.\n", - "Weight relative change: 99.10%\n", - "Saved minimised dataset to /Users/movil1/Desktop/PYTHONJOBS/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", - "Targeting Medicaid enrollment for AK with target 231577k\n", - "Targeting Medicaid enrollment for AL with target 766009k\n", - "Targeting Medicaid enrollment for AR with target 733561k\n", - "Targeting Medicaid enrollment for AZ with target 1778734k\n", - "Targeting Medicaid enrollment for CA with target 12172695k\n", - "Targeting Medicaid enrollment for CO with target 1058326k\n", - "Targeting Medicaid enrollment for CT with target 904321k\n", - "Targeting Medicaid enrollment for DC with target 240020k\n", - "Targeting Medicaid enrollment for DE with target 236840k\n", - "Targeting Medicaid enrollment for FL with target 3568648k\n", - "Targeting Medicaid enrollment for GA with target 1699279k\n", - "Targeting Medicaid enrollment for HI with target 376318k\n", - "Targeting Medicaid enrollment for IA with target 586748k\n", - "Targeting Medicaid enrollment for ID with target 296968k\n", - "Targeting Medicaid enrollment for IL with target 2918179k\n", - "Targeting Medicaid enrollment for IN with target 1623361k\n", - "Targeting Medicaid enrollment for KS with target 335902k\n", - "Targeting Medicaid enrollment for KY with target 1244822k\n", - "Targeting Medicaid enrollment for LA with target 1377806k\n", - "Targeting Medicaid enrollment for MA with target 1453344k\n", - "Targeting Medicaid enrollment for MD with target 1280697k\n", - "Targeting Medicaid enrollment for ME with target 322306k\n", - "Targeting Medicaid enrollment for MI with target 2194067k\n", - "Targeting Medicaid enrollment for MN with target 1146667k\n", - "Targeting Medicaid enrollment for MO with target 1118780k\n", - "Targeting Medicaid enrollment for MS with target 514730k\n", - "Targeting Medicaid enrollment for MT with target 193278k\n", - "Targeting Medicaid enrollment for NC with target 2469712k\n", - "Targeting Medicaid enrollment for ND with target 100543k\n", - "Targeting Medicaid enrollment for NE with target 302971k\n", - "Targeting Medicaid enrollment for NH with target 166813k\n", - "Targeting Medicaid enrollment for NJ with target 1506239k\n", - "Targeting Medicaid enrollment for NM with target 686825k\n", - "Targeting Medicaid enrollment for NV with target 713936k\n", - "Targeting Medicaid enrollment for NY with target 5946806k\n", - "Targeting Medicaid enrollment for OH with target 2596879k\n", - "Targeting Medicaid enrollment for OK with target 894911k\n", - "Targeting Medicaid enrollment for OR with target 1123313k\n", - "Targeting Medicaid enrollment for PA with target 2783389k\n", - "Targeting Medicaid enrollment for RI with target 273400k\n", - "Targeting Medicaid enrollment for SC with target 932515k\n", - "Targeting Medicaid enrollment for SD with target 126952k\n", - "Targeting Medicaid enrollment for TN with target 1268904k\n", - "Targeting Medicaid enrollment for TX with target 3821806k\n", - "Targeting Medicaid enrollment for UT with target 300742k\n", - "Targeting Medicaid enrollment for VA with target 1596777k\n", - "Targeting Medicaid enrollment for VT with target 151833k\n", - "Targeting Medicaid enrollment for WA with target 1776116k\n", - "Targeting Medicaid enrollment for WI with target 1108320k\n", - "Targeting Medicaid enrollment for WV with target 467632k\n", - "Targeting Medicaid enrollment for WY with target 57320k\n" - ] - } - ], + "outputs": 
[], "source": [ "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", "\n", @@ -212,20 +28,18 @@ " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", " ]\n", "\n", - "approach = \"candidate_loss_contribution\" # for which you can specify the fraction\n", - "minimization_function = candidate_loss_contribution\n", - "# other minimization function approach is \"random_sampling_minimization\", for which you can specify the tolerance for loss relative change.\n", + "approach = \"random_sampling_minimization\" # for which you can specify the fraction\n", + "minimization_function = random_sampling_minimization\n", + "# other minimization function approach is \"candidate_loss_contribution\"\n", "\n", "for file in files:\n", " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", - " output_path.parent.mkdir(parents=True, exist_ok=True)\n", - "\n", " minimise_dataset(\n", " file,\n", " output_path,\n", + " loss_rel_change_max=10,\n", " minimization_function=minimization_function, \n", - " # target_fractions=[0.5] # remove if switching approach\n", - " loss_rel_change_max=0.0001, # remove if switching approach\n", + " target_fractions=[0.5] # remove if switching approach\n", " )" ] }, @@ -267,14 +81,6 @@ "\n", "data.save_dataset(output_path)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b4cf8e89", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From 94cacdeab007e318fe849bb3bbf4b29d7fcf627a Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Mon, 14 Jul 2025 15:22:58 +0200 Subject: [PATCH 38/58] few updates to the testing framework --- changelog_entry.yaml | 2 +- .../datasets/cps/enhanced_cps.py | 58 ++--- policyengine_us_data/utils/minimise.py | 59 +++++- pyproject.toml | 3 +- test_minimization_approach.ipynb | 198 +++++++++++++++++- 5 files changed, 280 insertions(+), 40 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 84eeb584..ac664753 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,4 +1,4 @@ - bump: minor changes: added: - - Enhanced CPS minimizing tests. \ No newline at end of file + - Enhanced CPS minimizing tests. 
\ No newline at end of file diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index ca53a84d..bf4b5501 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -40,7 +40,7 @@ def reweight( targets_array, dropout_rate=0.05, log_path="calibration_log.csv", - penalty_approach="l0_sigmoid", + penalty_approach=None, ): target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") @@ -76,35 +76,43 @@ def loss(weights, penalty_approach=penalty_approach): if torch.isnan(rel_error_normalized).any(): raise ValueError("Relative error contains NaNs") - # L0 penalty (approximated with smooth function) - # Since L0 is non-differentiable, we use a smooth approximation - # Common approaches: + if penalty_approach is not None: + # L0 penalty (approximated with smooth function) + # Since L0 is non-differentiable, we use a smooth approximation + # Common approaches: - epsilon = 1e-3 # Threshold for "near zero" - l0_penalty_weight = 1e-1 # Adjust this hyperparameter + epsilon = 1e-3 # Threshold for "near zero" + l0_penalty_weight = 1e-1 # Adjust this hyperparameter - # Option 1: Sigmoid approximation - if penalty_approach == "l0_sigmoid": - smoothed_l0 = torch.sigmoid( - (weights - epsilon) / (epsilon * 0.1) - ).mean() + # Option 1: Sigmoid approximation + if penalty_approach == "l0_sigmoid": + smoothed_l0 = torch.sigmoid( + (weights - epsilon) / (epsilon * 0.1) + ).mean() - # Option 2: Log-sum penalty (smoother) - if penalty_approach == "l0_log": - smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len(weights) + # Option 2: Log-sum penalty (smoother) + if penalty_approach == "l0_log": + smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len( + weights + ) - # Option 3: Exponential penalty - if penalty_approach == "l0_exp": - smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() + # Option 3: Exponential penalty + if penalty_approach == "l0_exp": + smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() - # L1 penalty - l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs + # L1 penalty + l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs - if penalty_approach == "l1": - l1 = torch.mean(weights) - return rel_error_normalized.mean() + l1_penalty_weight * l1 + if penalty_approach == "l1": + l1 = torch.mean(weights) + return rel_error_normalized.mean() + l1_penalty_weight * l1 - return rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 + return ( + rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 + ) + + else: + return rel_error_normalized.mean() def dropout_weights(weights, p): if p == 0: @@ -249,9 +257,9 @@ def generate(self): loss_matrix, targets_array = build_loss_matrix( self.input_dataset, year ) - zero_mask = np.isclose(targets_array, 0.0, atol=0.1) + bad_mask = loss_matrix.columns.isin(bad_targets) - keep_mask_bool = ~(zero_mask | bad_mask) + keep_mask_bool = ~bad_mask keep_idx = np.where(keep_mask_bool)[0] loss_matrix_clean = loss_matrix.iloc[:, keep_idx] targets_array_clean = targets_array[keep_idx] diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index da2cb7d1..9c3d59eb 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -5,14 +5,33 @@ import pandas as pd import h5py from policyengine_us_data.storage import STORAGE_FOLDER -from typing import Optional +from typing import Optional, Callable 
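# Minimal sketch of how the bad-target columns listed below are dropped from the
# loss matrix before calibration. The two-column DataFrame is a toy stand-in for
# the output of build_loss_matrix, and the second column name is invented.
import numpy as np
import pandas as pd

loss_matrix = pd.DataFrame(
    np.ones((3, 2)),
    columns=[
        "state/RI/adjusted_gross_income/amount/-inf_1",
        "nation/some_well_behaved_target",
    ],
)
targets = np.array([1.0, 2.0])
bad_targets = ["state/RI/adjusted_gross_income/amount/-inf_1"]

bad_mask = loss_matrix.columns.isin(bad_targets)
keep_idx = np.where(~bad_mask)[0]
loss_matrix_clean = loss_matrix.iloc[:, keep_idx]
targets_clean = targets[keep_idx]
assert loss_matrix_clean.shape[1] == targets_clean.size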
+bad_targets = [ + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", +] -def create_calibration_log_file(file_path): + +def create_calibration_log_file(file_path, epoch=0): dataset = Dataset.from_file(file_path) loss_matrix, targets = build_loss_matrix(dataset, 2024) + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_clean = targets[keep_idx] + assert loss_matrix_clean.shape[1] == targets_clean.size + loss_matrix, targets = build_loss_matrix(dataset, 2024) + bad_mask = loss_matrix.columns.isin(bad_targets) keep_mask_bool = ~bad_mask keep_idx = np.where(keep_mask_bool)[0] @@ -22,6 +41,10 @@ def create_calibration_log_file(file_path): sim = Microsimulation(dataset=dataset) + estimates = ( + sim.calculate("household_weight", 2024).values @ loss_matrix_clean + ) + target_names = loss_matrix_clean.columns estimates = ( sim.calculate("household_weight", 2024).values @ loss_matrix_clean ) @@ -32,9 +55,11 @@ def create_calibration_log_file(file_path): "target_name": target_names, "estimate": estimates, "target": targets_clean, + "target": targets_clean, } ) df["epoch"] = epoch + df["epoch"] = epoch df["error"] = df["estimate"] - df["target"] df["rel_error"] = df["error"] / df["target"] df["abs_error"] = df["error"].abs() @@ -43,6 +68,11 @@ def create_calibration_log_file(file_path): if df["target"].abs().sum() > 0 else np.nan ) + df["rel_abs_error"] = ( + df["abs_error"] / df["target"].abs() + if df["target"].abs().sum() > 0 + else np.nan + ) df["loss"] = (df["rel_error"] ** 2).mean() df.to_csv( @@ -258,6 +288,7 @@ def random_sampling_minimization( targets, normalisation_factor, random=True, + random=True, target_fractions=[0.1, 0.2, 0.3, 0.4, 0.5], ): """A simple random sampling approach""" @@ -265,6 +296,8 @@ def random_sampling_minimization( household_weights_normalized = weights / weights.sum() + household_weights_normalized = weights / weights.sum() + final_mask = None lowest_loss = float("inf") for fraction in target_fractions: @@ -283,6 +316,14 @@ def random_sampling_minimization( replace=False, ) ] = True + mask[ + np.random.choice( + n, + target_size, + p=household_weights_normalized if random else None, + replace=False, + ) + ] = True loss = get_loss_from_mask( weights, mask, estimate_matrix, targets, normalisation_factor @@ -322,6 +363,14 @@ def minimise_dataset( dataset = Dataset.from_file(dataset) loss_matrix, targets = build_loss_matrix(dataset, 2024) + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_clean = targets[keep_idx] + assert loss_matrix_clean.shape[1] == targets_clean.size + loss_matrix, targets = build_loss_matrix(dataset, 2024) + bad_mask = 
loss_matrix.columns.isin(bad_targets) keep_mask_bool = ~bad_mask keep_idx = np.where(keep_mask_bool)[0] @@ -333,6 +382,7 @@ def minimise_dataset( weights = sim.calculate("household_weight", 2024).values is_national = loss_matrix_clean.columns.str.startswith("nation/") + is_national = loss_matrix_clean.columns.str.startswith("nation/") nation_normalisation_factor = is_national * (1 / is_national.sum()) state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) normalisation_factor = np.where( @@ -344,8 +394,11 @@ def minimise_dataset( weights=weights, estimate_matrix=loss_matrix_clean, targets=targets_clean, + estimate_matrix=loss_matrix_clean, + targets=targets_clean, normalisation_factor=normalisation_factor, **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. + **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. ) # Extract household IDs for remaining households @@ -386,7 +439,7 @@ def minimise_dataset( f.create_dataset(f"{variable}/{year}", data=value) print(f"Saved minimised dataset to {output_path}") - create_calibration_log_file(output_path) + create_calibration_log_file(output_path, epoch=500) if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml index 5a75693f..7f3e59b5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,9 +17,8 @@ authors = [ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ - "policyengine-us>=1.340.1", + "policyengine-us>=1.340.0", "policyengine-core>=3.17.1", - "pandas>=2.3.0", "requests", "tqdm", "microdf_python>=0.4.3", diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 519d2725..5a7a9d15 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "d6dc9cca", "metadata": {}, "outputs": [], @@ -12,15 +12,188 @@ "from policyengine_us import Microsimulation\n", "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", "from policyengine_us_data.utils import build_loss_matrix\n", - "import numpy as np" + "import numpy as np\n", + "import os\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "db975ac1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + "Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid enrollment for IN with target 1623361k\n", + 
"Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with target 1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + "Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n", + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + "Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid enrollment for IN with target 1623361k\n", + "Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with target 1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + 
"Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n", + "Iteration 1: Loss changed from 4.565926440883813e-08 to 5.336074793134824e-08\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 2: Loss changed from 5.336074793134824e-08 to 6.613975524012925e-08\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 3: Loss changed from 6.613975524012925e-08 to 8.772356346466888e-08\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 4: Loss changed from 8.772356346466888e-08 to 1.1406333044735764e-07\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 5: Loss changed from 1.1406333044735764e-07 to 1.3666972105275468e-07\n", + "Removed 4131 households with worst relative loss changes.\n", + "Weight relative change: 52.19%\n", + "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + 
"Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid enrollment for IN with target 1623361k\n", + "Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with target 1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + "Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n" + ] + } + ], "source": [ "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", "\n", @@ -28,18 +201,17 @@ " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", " ]\n", "\n", - "approach = \"random_sampling_minimization\" # for which you can specify the fraction\n", - "minimization_function = random_sampling_minimization\n", - "# other minimization function approach is \"candidate_loss_contribution\"\n", + "approach = \"candidate_loss_contribution\" # for which you can specify the fraction\n", + "minimization_function = candidate_loss_contribution\n", + "# other minimization function approach is \"candidate_loss_contribution\", for which you can specify the tolerance for loss relative change.\n", "\n", "for file in files:\n", " output_path = STORAGE_FOLDER / approach / 
\"enhanced_cps_2024_minimised.h5\"\n", " minimise_dataset(\n", " file,\n", " output_path,\n", - " loss_rel_change_max=10,\n", " minimization_function=minimization_function, \n", - " target_fractions=[0.5] # remove if switching approach\n", + " #target_fractions=[0.5] # remove if switching approach\n", " )" ] }, @@ -81,6 +253,14 @@ "\n", "data.save_dataset(output_path)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4cf8e89", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From a71530b7b6f2723cfbf54a64f8f28f9d77e6da1d Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Mon, 14 Jul 2025 19:56:54 +0200 Subject: [PATCH 39/58] fix calibration for each approach --- .../datasets/cps/enhanced_cps.py | 1 + policyengine_us_data/utils/loss.py | 5 -- policyengine_us_data/utils/minimise.py | 89 ++++++++++++++----- test_minimization_approach.ipynb | 86 ++++++++++++------ 4 files changed, 129 insertions(+), 52 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index bf4b5501..33f62929 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -39,6 +39,7 @@ def reweight( loss_matrix, targets_array, dropout_rate=0.05, + epochs=500, log_path="calibration_log.csv", penalty_approach=None, ): diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index 21abce0f..fbdbacef 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -552,11 +552,6 @@ def build_loss_matrix(dataset: type, time_period): # Convert to thousands for the target targets_array.append(row["enrollment"]) - print( - f"Targeting Medicaid enrollment for {row['state']} " - f"with target {row['enrollment']:.0f}k" - ) - # State 10-year age targets age_targets = pd.read_csv(STORAGE_FOLDER / "age_state.csv") diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 9c3d59eb..84c55d31 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -6,6 +6,7 @@ import h5py from policyengine_us_data.storage import STORAGE_FOLDER from typing import Optional, Callable +from policyengine_us_data.datasets.cps.enhanced_cps import reweight bad_targets = [ "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", @@ -20,35 +21,54 @@ def create_calibration_log_file(file_path, epoch=0): + print(f"=== CALIBRATION LOG DEBUG ===") + print(f"File path: {file_path}") + print(f"Epoch: {epoch}") + dataset = Dataset.from_file(file_path) + sim = Microsimulation(dataset=dataset) - loss_matrix, targets = build_loss_matrix(dataset, 2024) + # Debug: Print dataset info + household_weights = sim.calculate("household_weight", 2024) + print(f"Number of households: {len(household_weights)}") + print(f"Total weight: {household_weights.sum():.2f}") + print( + f"Weight range: {household_weights.min():.2f} to {household_weights.max():.2f}" + ) - bad_mask = loss_matrix.columns.isin(bad_targets) - keep_mask_bool = ~bad_mask - keep_idx = np.where(keep_mask_bool)[0] - loss_matrix_clean = loss_matrix.iloc[:, keep_idx] - targets_clean = targets[keep_idx] - assert loss_matrix_clean.shape[1] == targets_clean.size loss_matrix, targets = build_loss_matrix(dataset, 2024) + print(f"Loss matrix shape: {loss_matrix.shape}") + print(f"Number of targets: {len(targets)}") bad_mask = loss_matrix.columns.isin(bad_targets) keep_mask_bool = ~bad_mask keep_idx = 
np.where(keep_mask_bool)[0] loss_matrix_clean = loss_matrix.iloc[:, keep_idx] targets_clean = targets[keep_idx] - assert loss_matrix_clean.shape[1] == targets_clean.size - sim = Microsimulation(dataset=dataset) + print(f"After filtering bad targets:") + print(f"Loss matrix clean shape: {loss_matrix_clean.shape}") + print(f"Number of clean targets: {len(targets_clean)}") + + assert loss_matrix_clean.shape[1] == targets_clean.size estimates = ( sim.calculate("household_weight", 2024).values @ loss_matrix_clean ) target_names = loss_matrix_clean.columns - estimates = ( - sim.calculate("household_weight", 2024).values @ loss_matrix_clean - ) - target_names = loss_matrix_clean.columns + + # Debug: Print estimate statistics + print(f"Estimates shape: {estimates.shape}") + print(f"Estimates sum: {estimates.sum():.2f}") + print(f"First 3 estimates: {estimates[:3]}") + print(f"First 3 targets: {targets_clean[:3]}") + + # Calculate and print some key metrics + errors = estimates - targets_clean + rel_errors = errors / targets_clean + print(f"Mean absolute error: {np.abs(errors).mean():.2f}") + print(f"Mean relative error: {np.abs(rel_errors).mean():.4f}") + print(f"=== END DEBUG ===\n") df = pd.DataFrame( { @@ -158,6 +178,7 @@ def get_loss_from_mask( """ Calculate the loss based on the inclusion mask and the estimate matrix. """ + # Step 1: Apply mask and rescale weights masked_weights = weights.copy() original_weight_total = masked_weights.sum() if (~inclusion_mask).sum() > 0: @@ -166,7 +187,26 @@ def get_loss_from_mask( masked_weights[inclusion_mask] *= ( original_weight_total / masked_weight_total ) - estimates = masked_weights @ estimate_matrix + + # Step 2: Re-calibrate the masked weights to hit targets + # Only calibrate the included households + included_weights = masked_weights[inclusion_mask] + included_estimate_matrix = estimate_matrix[inclusion_mask] + + # Call reweight function to calibrate the selected households + calibrated_weights_included = reweight( + included_weights, + included_estimate_matrix, + targets, + epochs=250, + ) + + # Put calibrated weights back into full array + calibrated_weights = np.zeros_like(masked_weights) + calibrated_weights[inclusion_mask] = calibrated_weights_included + + # Calculate estimates and loss from calibrated weights + estimates = calibrated_weights @ estimate_matrix rel_error = ((estimates - targets) + 1) / (targets + 1) loss = ((rel_error * normalisation_factor) ** 2).mean() @@ -288,8 +328,7 @@ def random_sampling_minimization( targets, normalisation_factor, random=True, - random=True, - target_fractions=[0.1, 0.2, 0.3, 0.4, 0.5], + target_fractions=[0.5, 0.6, 0.7, 0.8, 0.9], ): """A simple random sampling approach""" n = len(weights) @@ -306,7 +345,7 @@ def random_sampling_minimization( best_mask = None best_loss = float("inf") - for _ in range(5): # Try 5 random samples + for _ in range(3): # Try 3 random samples mask = np.zeros(n, dtype=bool) mask[ np.random.choice( @@ -419,12 +458,20 @@ def minimise_dataset( sim = Microsimulation(dataset=smaller_df) # Rescale weights to maintain total - sim.set_input( - "household_weight", - 2024, - sim.calculate("household_weight", 2024).values / weight_rel_change, + initial_weights = ( + sim.calculate("household_weight", 2024).values / weight_rel_change ) + # Re-calibrate the final selected households to hit targets + print("Re-calibrating final selected households...") + calibrated_weights = reweight( + initial_weights, + loss_matrix_clean.values, # Convert to numpy array + targets_clean, + epochs=250, # 
Reduced epochs for faster processing + ) + sim.set_input("household_weight", 2024, calibrated_weights) + print("Final calibration completed successfully") # Prepare data for saving data = {} for variable in sim.input_variables: diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 5a7a9d15..6683da0c 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 6, + "execution_count": 1, "id": "d6dc9cca", "metadata": {}, "outputs": [], @@ -13,12 +13,27 @@ "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", "from policyengine_us_data.utils import build_loss_matrix\n", "import numpy as np\n", - "import os\n" + "import os" ] }, { "cell_type": "code", +<<<<<<< HEAD "execution_count": 7, +======= + "execution_count": null, + "id": "6daabe7c", + "metadata": {}, + "outputs": [], + "source": [ + "# Original ECPS 2024 dataset size (for household entity): 41310\n", + "# After minimization through \"candidate_loss_contribution\" and a 1.0 max error change (for household entity): 20655 \n", + "# After minimization through \"candidate_loss_contribution\" and a 0.001 max error change (for household entity): 24786\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "id": "db975ac1", "metadata": {}, "outputs": [ @@ -128,18 +143,17 @@ "Targeting Medicaid enrollment for WI with target 1108320k\n", "Targeting Medicaid enrollment for WV with target 467632k\n", "Targeting Medicaid enrollment for WY with target 57320k\n", - "Iteration 1: Loss changed from 4.565926440883813e-08 to 5.336074793134824e-08\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 2: Loss changed from 5.336074793134824e-08 to 6.613975524012925e-08\n", + "Iteration 1: Loss changed from 3.2762747622384236e-07 to 3.269240481699973e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Iteration 3: Loss changed from 6.613975524012925e-08 to 8.772356346466888e-08\n", + "Iteration 2: Loss changed from 3.269240481699973e-07 to 3.260236288186747e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Iteration 4: Loss changed from 8.772356346466888e-08 to 1.1406333044735764e-07\n", + "Iteration 3: Loss changed from 3.260236288186747e-07 to 3.258031076364204e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Iteration 5: Loss changed from 1.1406333044735764e-07 to 1.3666972105275468e-07\n", + "Iteration 4: Loss changed from 3.258031076364204e-07 to 3.2572688458943833e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Weight relative change: 52.19%\n", - "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", + "Iteration 5: Loss changed from 3.2572688458943833e-07 to 3.2598496400284314e-07, which is too high (0.08%). 
Stopping.\n", + "Weight relative change: 99.10%\n", + "Saved minimised dataset to /Users/movil1/Desktop/PYTHONJOBS/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", "Targeting Medicaid enrollment for AK with target 231577k\n", "Targeting Medicaid enrollment for AL with target 766009k\n", "Targeting Medicaid enrollment for AR with target 733561k\n", @@ -203,32 +217,38 @@ "\n", "approach = \"candidate_loss_contribution\" # for which you can specify the fraction\n", "minimization_function = candidate_loss_contribution\n", - "# other minimization function approach is \"candidate_loss_contribution\", for which you can specify the tolerance for loss relative change.\n", + "# other minimization function approach is \"random_sampling_minimization\", for which you can specify the tolerance for loss relative change.\n", "\n", "for file in files:\n", " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + "\n", " minimise_dataset(\n", " file,\n", " output_path,\n", " minimization_function=minimization_function, \n", - " #target_fractions=[0.5] # remove if switching approach\n", + " # target_fractions=[0.5] # remove if switching approach\n", + " loss_rel_change_max=0.0001, # remove if switching approach\n", " )" ] }, { "cell_type": "code", - "execution_count": null, - "id": "35892c9d", + "execution_count": 4, + "id": "b4cf8e89", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250/250 [01:24<00:00, 2.98it/s, loss=3.37e-5, loss_rel_change=-0.92] \n" + ] + } + ], "source": [ - "## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation\n", - "\n", "input_dataset = ExtendedCPS_2024\n", "\n", - "approach = \"l0_sigmoid\"\n", - "# other options are \"l0_log\", \"l0_exp\", \"l1\"\n", - "\n", "sim = Microsimulation(dataset=input_dataset)\n", "data = sim.dataset.load_dataset()\n", "data[\"household_weight\"] = {}\n", @@ -240,18 +260,32 @@ " loss_matrix, targets_array = build_loss_matrix(\n", " input_dataset, year\n", " )\n", + "\n", + " bad_mask = loss_matrix.columns.isin(bad_targets)\n", + " keep_mask_bool = ~bad_mask\n", + " keep_idx = np.where(keep_mask_bool)[0]\n", + " loss_matrix_clean = loss_matrix.iloc[:, keep_idx]\n", + " targets_array_clean = targets_array[keep_idx]\n", + " assert loss_matrix_clean.shape[1] == targets_array_clean.size\n", + " assert loss_matrix_clean.shape[1] != loss_matrix.shape[1]\n", + "\n", " optimised_weights = reweight(\n", " original_weights,\n", - " loss_matrix,\n", - " targets_array,\n", - " log_path= STORAGE_FOLDER / approach / \"calibration_log.csv\",\n", - " penalty_approach=approach,\n", + " loss_matrix_clean,\n", + " targets_array_clean,\n", + " log_path=\"baseline_calibration_log.csv\",\n", + " epochs=250, # Reduced epochs for faster processing\n", " )\n", " data[\"household_weight\"][year] = optimised_weights\n", "\n", - "output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + "output_path = STORAGE_FOLDER / \"baseline\" / \"enhanced_cps_2024_baseline.h5\"\n", + "output_path.parent.mkdir(parents=True, exist_ok=True)\n", "\n", - "data.save_dataset(output_path)" + "# Save to HDF5 file\n", + "with h5py.File(output_path, \"w\") as f:\n", + " for variable, values in data.items():\n", + " for year, value in values.items():\n", + " 
f.create_dataset(f\"{variable}/{year}\", data=value)" ] }, { From f146620a9c71761336d7b1c49ae5e54b09f100e4 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Mon, 14 Jul 2025 20:19:38 +0200 Subject: [PATCH 40/58] fixed testing framework --- policyengine_us_data/utils/minimise.py | 39 +-- test_minimization_approach.ipynb | 330 ++++++++++--------------- 2 files changed, 134 insertions(+), 235 deletions(-) diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 84c55d31..b3e0ed1a 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -75,11 +75,9 @@ def create_calibration_log_file(file_path, epoch=0): "target_name": target_names, "estimate": estimates, "target": targets_clean, - "target": targets_clean, } ) df["epoch"] = epoch - df["epoch"] = epoch df["error"] = df["estimate"] - df["target"] df["rel_error"] = df["error"] / df["target"] df["abs_error"] = df["error"].abs() @@ -88,11 +86,6 @@ def create_calibration_log_file(file_path, epoch=0): if df["target"].abs().sum() > 0 else np.nan ) - df["rel_abs_error"] = ( - df["abs_error"] / df["target"].abs() - if df["target"].abs().sum() > 0 - else np.nan - ) df["loss"] = (df["rel_error"] ** 2).mean() df.to_csv( @@ -172,6 +165,7 @@ def minimise_dataset( ) weights @ estimate_matrix + def get_loss_from_mask( weights, inclusion_mask, estimate_matrix, targets, normalisation_factor ): @@ -264,15 +258,6 @@ def candidate_loss_contribution( size=int(full_mask.sum() * view_fraction_per_iteration), replace=False, ) - - # more efficient approach to compute losses for candidate households to be removed - - # 1. sample only households that are currently *included* - indices = np.random.choice( - np.where(full_mask)[0], - size=int(full_mask.sum() * VIEW_FRACTION_PER_ITERATION), - replace=False, - ) # 2. 
compute losses for the batch in one shot candidate_losses = losses_for_candidates( weights, indices, estimate_matrix, targets, normalisation_factor @@ -335,8 +320,6 @@ def random_sampling_minimization( household_weights_normalized = weights / weights.sum() - household_weights_normalized = weights / weights.sum() - final_mask = None lowest_loss = float("inf") for fraction in target_fractions: @@ -355,14 +338,6 @@ def random_sampling_minimization( replace=False, ) ] = True - mask[ - np.random.choice( - n, - target_size, - p=household_weights_normalized if random else None, - replace=False, - ) - ] = True loss = get_loss_from_mask( weights, mask, estimate_matrix, targets, normalisation_factor @@ -402,14 +377,6 @@ def minimise_dataset( dataset = Dataset.from_file(dataset) loss_matrix, targets = build_loss_matrix(dataset, 2024) - bad_mask = loss_matrix.columns.isin(bad_targets) - keep_mask_bool = ~bad_mask - keep_idx = np.where(keep_mask_bool)[0] - loss_matrix_clean = loss_matrix.iloc[:, keep_idx] - targets_clean = targets[keep_idx] - assert loss_matrix_clean.shape[1] == targets_clean.size - loss_matrix, targets = build_loss_matrix(dataset, 2024) - bad_mask = loss_matrix.columns.isin(bad_targets) keep_mask_bool = ~bad_mask keep_idx = np.where(keep_mask_bool)[0] @@ -421,7 +388,6 @@ def minimise_dataset( weights = sim.calculate("household_weight", 2024).values is_national = loss_matrix_clean.columns.str.startswith("nation/") - is_national = loss_matrix_clean.columns.str.startswith("nation/") nation_normalisation_factor = is_national * (1 / is_national.sum()) state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) normalisation_factor = np.where( @@ -433,11 +399,8 @@ def minimise_dataset( weights=weights, estimate_matrix=loss_matrix_clean, targets=targets_clean, - estimate_matrix=loss_matrix_clean, - targets=targets_clean, normalisation_factor=normalisation_factor, **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. - **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. 
) # Extract household IDs for remaining households diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 6683da0c..7c416e2a 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "d6dc9cca", "metadata": {}, "outputs": [], @@ -13,228 +13,172 @@ "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", "from policyengine_us_data.utils import build_loss_matrix\n", "import numpy as np\n", - "import os" + "import os\n", + "import h5py\n", + "\n", + "bad_targets = [\n", + " \"nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household\",\n", + " \"nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household\",\n", + " \"nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse\",\n", + " \"nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse\",\n", + " \"nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household\",\n", + " \"nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household\",\n", + " \"nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse\",\n", + " \"nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse\",\n", + "]" ] }, { "cell_type": "code", -<<<<<<< HEAD - "execution_count": 7, -======= "execution_count": null, - "id": "6daabe7c", + "id": "683fd57e", "metadata": {}, "outputs": [], "source": [ - "# Original ECPS 2024 dataset size (for household entity): 41310\n", - "# After minimization through \"candidate_loss_contribution\" and a 1.0 max error change (for household entity): 20655 \n", - "# After minimization through \"candidate_loss_contribution\" and a 0.001 max error change (for household entity): 24786\n" + "# Length of household entity in the dataset measured through household_weight:\n", + "\n", + "# Original ECPS 2024 dataset size: 41310\n", + "# Through \"random_sampling_minimization\" with 0.5 of the dataset being pruned: 20655\n", + "# Through \"random_sampling_minimization\" with 0.2 of the dataset being pruned: 33408\n", + "# After minimization through \"candidate_loss_contribution\" and a 1.0 max error change: 20655 \n", + "# After minimization through \"candidate_loss_contribution\" and a 0.001 max error change: 24786" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "db975ac1", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Targeting Medicaid enrollment for AK with target 231577k\n", - "Targeting Medicaid enrollment for AL with target 766009k\n", - "Targeting Medicaid enrollment for AR with target 733561k\n", - "Targeting Medicaid enrollment for AZ with target 1778734k\n", - "Targeting Medicaid enrollment for CA with target 12172695k\n", - "Targeting Medicaid enrollment for CO with target 1058326k\n", - "Targeting Medicaid enrollment for CT with target 904321k\n", - "Targeting Medicaid enrollment for DC with target 240020k\n", - "Targeting Medicaid enrollment for DE with target 236840k\n", - "Targeting Medicaid enrollment for FL with target 3568648k\n", - "Targeting Medicaid enrollment for GA with target 1699279k\n", - "Targeting Medicaid enrollment for HI with target 376318k\n", - "Targeting Medicaid enrollment for IA with target 586748k\n", - "Targeting Medicaid enrollment for ID 
with target 296968k\n", - "Targeting Medicaid enrollment for IL with target 2918179k\n", - "Targeting Medicaid enrollment for IN with target 1623361k\n", - "Targeting Medicaid enrollment for KS with target 335902k\n", - "Targeting Medicaid enrollment for KY with target 1244822k\n", - "Targeting Medicaid enrollment for LA with target 1377806k\n", - "Targeting Medicaid enrollment for MA with target 1453344k\n", - "Targeting Medicaid enrollment for MD with target 1280697k\n", - "Targeting Medicaid enrollment for ME with target 322306k\n", - "Targeting Medicaid enrollment for MI with target 2194067k\n", - "Targeting Medicaid enrollment for MN with target 1146667k\n", - "Targeting Medicaid enrollment for MO with target 1118780k\n", - "Targeting Medicaid enrollment for MS with target 514730k\n", - "Targeting Medicaid enrollment for MT with target 193278k\n", - "Targeting Medicaid enrollment for NC with target 2469712k\n", - "Targeting Medicaid enrollment for ND with target 100543k\n", - "Targeting Medicaid enrollment for NE with target 302971k\n", - "Targeting Medicaid enrollment for NH with target 166813k\n", - "Targeting Medicaid enrollment for NJ with target 1506239k\n", - "Targeting Medicaid enrollment for NM with target 686825k\n", - "Targeting Medicaid enrollment for NV with target 713936k\n", - "Targeting Medicaid enrollment for NY with target 5946806k\n", - "Targeting Medicaid enrollment for OH with target 2596879k\n", - "Targeting Medicaid enrollment for OK with target 894911k\n", - "Targeting Medicaid enrollment for OR with target 1123313k\n", - "Targeting Medicaid enrollment for PA with target 2783389k\n", - "Targeting Medicaid enrollment for RI with target 273400k\n", - "Targeting Medicaid enrollment for SC with target 932515k\n", - "Targeting Medicaid enrollment for SD with target 126952k\n", - "Targeting Medicaid enrollment for TN with target 1268904k\n", - "Targeting Medicaid enrollment for TX with target 3821806k\n", - "Targeting Medicaid enrollment for UT with target 300742k\n", - "Targeting Medicaid enrollment for VA with target 1596777k\n", - "Targeting Medicaid enrollment for VT with target 151833k\n", - "Targeting Medicaid enrollment for WA with target 1776116k\n", - "Targeting Medicaid enrollment for WI with target 1108320k\n", - "Targeting Medicaid enrollment for WV with target 467632k\n", - "Targeting Medicaid enrollment for WY with target 57320k\n", - "Targeting Medicaid enrollment for AK with target 231577k\n", - "Targeting Medicaid enrollment for AL with target 766009k\n", - "Targeting Medicaid enrollment for AR with target 733561k\n", - "Targeting Medicaid enrollment for AZ with target 1778734k\n", - "Targeting Medicaid enrollment for CA with target 12172695k\n", - "Targeting Medicaid enrollment for CO with target 1058326k\n", - "Targeting Medicaid enrollment for CT with target 904321k\n", - "Targeting Medicaid enrollment for DC with target 240020k\n", - "Targeting Medicaid enrollment for DE with target 236840k\n", - "Targeting Medicaid enrollment for FL with target 3568648k\n", - "Targeting Medicaid enrollment for GA with target 1699279k\n", - "Targeting Medicaid enrollment for HI with target 376318k\n", - "Targeting Medicaid enrollment for IA with target 586748k\n", - "Targeting Medicaid enrollment for ID with target 296968k\n", - "Targeting Medicaid enrollment for IL with target 2918179k\n", - "Targeting Medicaid enrollment for IN with target 1623361k\n", - "Targeting Medicaid enrollment for KS with target 335902k\n", - "Targeting Medicaid enrollment for KY with 
target 1244822k\n", - "Targeting Medicaid enrollment for LA with target 1377806k\n", - "Targeting Medicaid enrollment for MA with target 1453344k\n", - "Targeting Medicaid enrollment for MD with target 1280697k\n", - "Targeting Medicaid enrollment for ME with target 322306k\n", - "Targeting Medicaid enrollment for MI with target 2194067k\n", - "Targeting Medicaid enrollment for MN with target 1146667k\n", - "Targeting Medicaid enrollment for MO with target 1118780k\n", - "Targeting Medicaid enrollment for MS with target 514730k\n", - "Targeting Medicaid enrollment for MT with target 193278k\n", - "Targeting Medicaid enrollment for NC with target 2469712k\n", - "Targeting Medicaid enrollment for ND with target 100543k\n", - "Targeting Medicaid enrollment for NE with target 302971k\n", - "Targeting Medicaid enrollment for NH with target 166813k\n", - "Targeting Medicaid enrollment for NJ with target 1506239k\n", - "Targeting Medicaid enrollment for NM with target 686825k\n", - "Targeting Medicaid enrollment for NV with target 713936k\n", - "Targeting Medicaid enrollment for NY with target 5946806k\n", - "Targeting Medicaid enrollment for OH with target 2596879k\n", - "Targeting Medicaid enrollment for OK with target 894911k\n", - "Targeting Medicaid enrollment for OR with target 1123313k\n", - "Targeting Medicaid enrollment for PA with target 2783389k\n", - "Targeting Medicaid enrollment for RI with target 273400k\n", - "Targeting Medicaid enrollment for SC with target 932515k\n", - "Targeting Medicaid enrollment for SD with target 126952k\n", - "Targeting Medicaid enrollment for TN with target 1268904k\n", - "Targeting Medicaid enrollment for TX with target 3821806k\n", - "Targeting Medicaid enrollment for UT with target 300742k\n", - "Targeting Medicaid enrollment for VA with target 1596777k\n", - "Targeting Medicaid enrollment for VT with target 151833k\n", - "Targeting Medicaid enrollment for WA with target 1776116k\n", - "Targeting Medicaid enrollment for WI with target 1108320k\n", - "Targeting Medicaid enrollment for WV with target 467632k\n", - "Targeting Medicaid enrollment for WY with target 57320k\n", - "Iteration 1: Loss changed from 3.2762747622384236e-07 to 3.269240481699973e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 2: Loss changed from 3.269240481699973e-07 to 3.260236288186747e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 3: Loss changed from 3.260236288186747e-07 to 3.258031076364204e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 4: Loss changed from 3.258031076364204e-07 to 3.2572688458943833e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 5: Loss changed from 3.2572688458943833e-07 to 3.2598496400284314e-07, which is too high (0.08%). 
Stopping.\n", - "Weight relative change: 99.10%\n", - "Saved minimised dataset to /Users/movil1/Desktop/PYTHONJOBS/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", - "Targeting Medicaid enrollment for AK with target 231577k\n", - "Targeting Medicaid enrollment for AL with target 766009k\n", - "Targeting Medicaid enrollment for AR with target 733561k\n", - "Targeting Medicaid enrollment for AZ with target 1778734k\n", - "Targeting Medicaid enrollment for CA with target 12172695k\n", - "Targeting Medicaid enrollment for CO with target 1058326k\n", - "Targeting Medicaid enrollment for CT with target 904321k\n", - "Targeting Medicaid enrollment for DC with target 240020k\n", - "Targeting Medicaid enrollment for DE with target 236840k\n", - "Targeting Medicaid enrollment for FL with target 3568648k\n", - "Targeting Medicaid enrollment for GA with target 1699279k\n", - "Targeting Medicaid enrollment for HI with target 376318k\n", - "Targeting Medicaid enrollment for IA with target 586748k\n", - "Targeting Medicaid enrollment for ID with target 296968k\n", - "Targeting Medicaid enrollment for IL with target 2918179k\n", - "Targeting Medicaid enrollment for IN with target 1623361k\n", - "Targeting Medicaid enrollment for KS with target 335902k\n", - "Targeting Medicaid enrollment for KY with target 1244822k\n", - "Targeting Medicaid enrollment for LA with target 1377806k\n", - "Targeting Medicaid enrollment for MA with target 1453344k\n", - "Targeting Medicaid enrollment for MD with target 1280697k\n", - "Targeting Medicaid enrollment for ME with target 322306k\n", - "Targeting Medicaid enrollment for MI with target 2194067k\n", - "Targeting Medicaid enrollment for MN with target 1146667k\n", - "Targeting Medicaid enrollment for MO with target 1118780k\n", - "Targeting Medicaid enrollment for MS with target 514730k\n", - "Targeting Medicaid enrollment for MT with target 193278k\n", - "Targeting Medicaid enrollment for NC with target 2469712k\n", - "Targeting Medicaid enrollment for ND with target 100543k\n", - "Targeting Medicaid enrollment for NE with target 302971k\n", - "Targeting Medicaid enrollment for NH with target 166813k\n", - "Targeting Medicaid enrollment for NJ with target 1506239k\n", - "Targeting Medicaid enrollment for NM with target 686825k\n", - "Targeting Medicaid enrollment for NV with target 713936k\n", - "Targeting Medicaid enrollment for NY with target 5946806k\n", - "Targeting Medicaid enrollment for OH with target 2596879k\n", - "Targeting Medicaid enrollment for OK with target 894911k\n", - "Targeting Medicaid enrollment for OR with target 1123313k\n", - "Targeting Medicaid enrollment for PA with target 2783389k\n", - "Targeting Medicaid enrollment for RI with target 273400k\n", - "Targeting Medicaid enrollment for SC with target 932515k\n", - "Targeting Medicaid enrollment for SD with target 126952k\n", - "Targeting Medicaid enrollment for TN with target 1268904k\n", - "Targeting Medicaid enrollment for TX with target 3821806k\n", - "Targeting Medicaid enrollment for UT with target 300742k\n", - "Targeting Medicaid enrollment for VA with target 1596777k\n", - "Targeting Medicaid enrollment for VT with target 151833k\n", - "Targeting Medicaid enrollment for WA with target 1776116k\n", - "Targeting Medicaid enrollment for WI with target 1108320k\n", - "Targeting Medicaid enrollment for WV with target 467632k\n", - "Targeting Medicaid enrollment for WY with target 57320k\n" - ] - } - ], + "outputs": 
[], "source": [ + "## ALL TESTS\n", + "\n", + "## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation\n", + "\n", + "input_dataset = ExtendedCPS_2024\n", + "\n", + "approaches = [\"l0_sigmoid\", \"l0_log\", \"l0_exp\", \"l1\"]\n", + "\n", + "for approach in approaches:\n", + " sim = Microsimulation(dataset=input_dataset)\n", + " data = sim.dataset.load_dataset()\n", + " data[\"household_weight\"] = {}\n", + " original_weights = sim.calculate(\"household_weight\")\n", + " original_weights = original_weights.values + np.random.normal(\n", + " 1, 0.1, len(original_weights)\n", + " )\n", + " for year in range(2024, 2025):\n", + " loss_matrix, targets_array = build_loss_matrix(\n", + " input_dataset, year\n", + " )\n", + "\n", + " bad_mask = loss_matrix.columns.isin(bad_targets)\n", + " keep_mask_bool = ~bad_mask\n", + " keep_idx = np.where(keep_mask_bool)[0]\n", + " loss_matrix_clean = loss_matrix.iloc[:, keep_idx]\n", + " targets_array_clean = targets_array[keep_idx]\n", + " assert loss_matrix_clean.shape[1] == targets_array_clean.size\n", + "\n", + " optimised_weights = reweight(\n", + " original_weights,\n", + " loss_matrix_clean,\n", + " targets_array_clean,\n", + " log_path=\"calibration_log.csv\",\n", + " penalty_approach=approach,\n", + " epochs=250, # Reduced epochs for faster processing\n", + " )\n", + " data[\"household_weight\"][year] = optimised_weights\n", + "\n", + " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + "\n", + " # Save to HDF5 file\n", + " with h5py.File(output_path, \"w\") as f:\n", + " for variable, values in data.items():\n", + " for year, value in values.items():\n", + " f.create_dataset(f\"{variable}/{year}\", data=value)\n", + "\n", + "\n", "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", "\n", "files = [\n", " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", " ]\n", "\n", - "approach = \"candidate_loss_contribution\" # for which you can specify the fraction\n", - "minimization_function = candidate_loss_contribution\n", + "approaches = {\n", + " \"random_sampling_minimization\": random_sampling_minimization,\n", + " \"candidate_loss_contribution\": candidate_loss_contribution,\n", + "}\n", + "\n", + "optional_params = {\n", + " \"random_sampling_minimization\": {\n", + " \"target_fractions\": [0.5, 0.6, 0.7, 0.8, 0.9], # fractions of the dataset to keep\n", + " },\n", + " \"candidate_loss_contribution\": {\n", + " \"loss_rel_change_max\": [0.00001, 0.000001, 0.0000001] # maximum relative change in\n", + " }\n", + "}\n", + "\n", + "for approach, function in approaches.items():\n", + " minimization_function = function\n", + " # other minimization function approach is \"random_sampling_minimization\", for which you can specify the tolerance for loss relative change.\n", + "\n", + " for params, values in optional_params[approach].items():\n", + " for value in values:\n", + " if params == \"target_fractions\":\n", + " for file in files:\n", + " output_path = STORAGE_FOLDER / approach / f\"{value}_enhanced_cps_2024_minimised.h5\"\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + " minimise_dataset(\n", + " file,\n", + " output_path,\n", + " minimization_function=minimization_function, \n", + " target_fractions=[value]\n", + " )\n", + " elif params == \"loss_rel_change_max\":\n", + " for file in files:\n", + " output_path = STORAGE_FOLDER / approach / 
f\"{value}_enhanced_cps_2024_minimised.h5\"\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + " minimise_dataset(\n", + " file,\n", + " output_path,\n", + " minimization_function=minimization_function, \n", + " loss_rel_change_max=value\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35892c9d", + "metadata": {}, + "outputs": [], + "source": [ + "## SMALL CHECKS BELOW -- IGNORE ---\n", + "\n", + "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", + "\n", + "files = [\n", + " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", + " ]\n", + "\n", + "minimization_function = random_sampling_minimization\n", "# other minimization function approach is \"random_sampling_minimization\", for which you can specify the tolerance for loss relative change.\n", "\n", "for file in files:\n", - " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + " output_path = STORAGE_FOLDER / \"random_sampling_minimization\" / f\"{1.0}_enhanced_cps_2024_minimised.h5\"\n", " output_path.parent.mkdir(parents=True, exist_ok=True)\n", - "\n", " minimise_dataset(\n", " file,\n", " output_path,\n", " minimization_function=minimization_function, \n", - " # target_fractions=[0.5] # remove if switching approach\n", - " loss_rel_change_max=0.0001, # remove if switching approach\n", + " target_fractions=[1.0]\n", " )" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "b4cf8e89", "metadata": {}, "outputs": [ @@ -287,14 +231,6 @@ " for year, value in values.items():\n", " f.create_dataset(f\"{variable}/{year}\", data=value)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b4cf8e89", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From 68349f80e0b2d6af2893eb738dde6c0a8b7eb9bd Mon Sep 17 00:00:00 2001 From: eccuraa Date: Mon, 14 Jul 2025 14:23:01 -0400 Subject: [PATCH 41/58] starting to collect results --- test_minimization_approach.ipynb | 152 ++++++++++++++++++++++++++++++- 1 file changed, 148 insertions(+), 4 deletions(-) diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 54f3c6fa..ffed7e46 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -5,7 +5,16 @@ "execution_count": 1, "id": "d6dc9cca", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], "source": [ "from policyengine_us_data.utils.minimise import minimise_dataset, random_sampling_minimization, candidate_loss_contribution\n", "from policyengine_us_data.storage import STORAGE_FOLDER\n", @@ -268,18 +277,153 @@ "data.save_dataset(output_path)" ] }, + { + "cell_type": "markdown", + "id": "fedc4fc7", + "metadata": {}, + "source": [ + "## FULL DATA DOWNLOAD PIPELINE" + ] + }, + { + "cell_type": "markdown", + "id": "2218e211", + "metadata": {}, + "source": [ + "Set up line plot dataframe, initializing it with the original enhanced_cps results." 
+ ] + }, { "cell_type": "code", "execution_count": null, - "id": "b4cf8e89", + "id": "3b1bba26", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "# Create initial DataFrame and adding structure\n", + "results_df = pd.DataFrame({\n", + " 'strategy': ['none'],\n", + " 'parameter': ['none'],\n", + " 'dataset_size': [41600],\n", + " 'total_loss': [6.9e-3]\n", + "})\n", + "\n", + "def add_result(df, strategy, parameter, dataset_size, total_loss):\n", + " new_rows = pd.DataFrame({\n", + " 'strategy': strategy,\n", + " 'parameter': parameter,\n", + " 'dataset_size': dataset_size,\n", + " 'total_loss': total_loss\n", + " })\n", + " return pd.concat([df, new_rows], ignore_index=True)\n", + "\n", + "# Example usage:\n", + "#df = add_result(df, ['L1', 'L2'], ['0.001','0.002'] , [35000, 4000], [7.2e-3, 7.2e-3])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6df48427", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
strategyparameterdataset_sizetotal_loss
0nonenone416000.0069
1L10.001350000.0072
2L20.00240000.0072
\n", + "
" + ], + "text/plain": [ + " strategy parameter dataset_size total_loss\n", + "0 none none 41600 0.0069\n", + "1 L1 0.001 35000 0.0072\n", + "2 L2 0.002 4000 0.0072" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "markdown", + "id": "aa483f59", + "metadata": {}, + "source": [ + "Collecting length of dataset and total loss values for every regularization strategy." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9eb9602c", "metadata": {}, "outputs": [], "source": [] + }, + { + "cell_type": "markdown", + "id": "f5023a3a", + "metadata": {}, + "source": [ + "### VISUALIZATION" + ] } ], "metadata": { "kernelspec": { - "display_name": "pe", + "display_name": "policyengine-us-data", "language": "python", "name": "python3" }, @@ -293,7 +437,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.11" + "version": "3.11.13" } }, "nbformat": 4, From 4d593b99e049004ae354ca8a6349056c5b57108c Mon Sep 17 00:00:00 2001 From: eccuraa Date: Mon, 14 Jul 2025 17:07:22 -0400 Subject: [PATCH 42/58] added functionality for running multiple L0/L1 penalty values & dataframe for plotting --- .../datasets/cps/enhanced_cps.py | 14 +- test_minimization_approach.ipynb | 382 ++++++++++++++++-- 2 files changed, 349 insertions(+), 47 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 83fe6b99..851ea464 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -42,6 +42,8 @@ def reweight( epochs=500, log_path="calibration_log.csv", penalty_approach=None, + penalty_weight=None, + ): target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") @@ -60,7 +62,7 @@ def reweight( ) # TO DO: replace this with a call to the python reweight.py package. 
- def loss(weights, penalty_approach=penalty_approach): + def loss(weights, penalty_approach=penalty_approach, penalty_weight=penalty_weight): # Check for Nans in either the weights or the loss matrix if torch.isnan(weights).any(): raise ValueError("Weights contain NaNs") @@ -77,13 +79,13 @@ def loss(weights, penalty_approach=penalty_approach): if torch.isnan(rel_error_normalized).any(): raise ValueError("Relative error contains NaNs") - if penalty_approach is not None: + if penalty_approach is not None and penalty_weight is not None: # L0 penalty (approximated with smooth function) # Since L0 is non-differentiable, we use a smooth approximation # Common approaches: epsilon = 1e-3 # Threshold for "near zero" - l0_penalty_weight = 1e-1 # Adjust this hyperparameter + # Option 1: Sigmoid approximation if penalty_approach == "l0_sigmoid": @@ -101,15 +103,13 @@ def loss(weights, penalty_approach=penalty_approach): if penalty_approach == "l0_exp": smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() - # L1 penalty - l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs if penalty_approach == "l1": l1 = torch.mean(weights) - return rel_error_normalized.mean() + l1_penalty_weight * l1 + return rel_error_normalized.mean() + penalty_weight * l1 return ( - rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 + rel_error_normalized.mean() + penalty_weight * smoothed_l0 ) else: diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 7c416e2a..bb77568a 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -15,6 +15,8 @@ "import numpy as np\n", "import os\n", "import h5py\n", + "import pandas as pd\n", + "\n", "\n", "bad_targets = [\n", " \"nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household\",\n", @@ -58,45 +60,58 @@ "input_dataset = ExtendedCPS_2024\n", "\n", "approaches = [\"l0_sigmoid\", \"l0_log\", \"l0_exp\", \"l1\"]\n", + "penalty_weights = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]\n", "\n", - "for approach in approaches:\n", - " sim = Microsimulation(dataset=input_dataset)\n", - " data = sim.dataset.load_dataset()\n", - " data[\"household_weight\"] = {}\n", - " original_weights = sim.calculate(\"household_weight\")\n", - " original_weights = original_weights.values + np.random.normal(\n", - " 1, 0.1, len(original_weights)\n", - " )\n", - " for year in range(2024, 2025):\n", - " loss_matrix, targets_array = build_loss_matrix(\n", - " input_dataset, year\n", - " )\n", - "\n", - " bad_mask = loss_matrix.columns.isin(bad_targets)\n", - " keep_mask_bool = ~bad_mask\n", - " keep_idx = np.where(keep_mask_bool)[0]\n", - " loss_matrix_clean = loss_matrix.iloc[:, keep_idx]\n", - " targets_array_clean = targets_array[keep_idx]\n", - " assert loss_matrix_clean.shape[1] == targets_array_clean.size\n", - "\n", - " optimised_weights = reweight(\n", - " original_weights,\n", - " loss_matrix_clean,\n", - " targets_array_clean,\n", - " log_path=\"calibration_log.csv\",\n", - " penalty_approach=approach,\n", - " epochs=250, # Reduced epochs for faster processing\n", - " )\n", - " data[\"household_weight\"][year] = optimised_weights\n", - "\n", - " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + "def get_output_path(approach, file_name):\n", + " output_path = STORAGE_FOLDER / approach / file_name\n", " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + " return output_path\n", + "\n", + "results = []\n", "\n", - " # Save to HDF5 file\n", - " with h5py.File(output_path, \"w\") 
as f:\n", - " for variable, values in data.items():\n", - " for year, value in values.items():\n", - " f.create_dataset(f\"{variable}/{year}\", data=value)\n", + "for approach in approaches:\n", + " for penalty_weight in penalty_weights:\n", + " # Storing files in correct locations\n", + " cal_log_name = f\"calibration_log_{approach}_{penalty_weight}.csv\"\n", + " h5_name = f\"enhanced_cps_2024_{approach}_{penalty_weight}_minimised.h5\"\n", + " cal_log_path = get_output_path(approach, cal_log_name)\n", + " h5_path = get_output_path(approach, h5_name)\n", + "\n", + " sim = Microsimulation(dataset=input_dataset)\n", + " data = sim.dataset.load_dataset()\n", + " data[\"household_weight\"] = {}\n", + " original_weights = sim.calculate(\"household_weight\")\n", + " original_weights = original_weights.values + np.random.normal(\n", + " 1, 0.1, len(original_weights)\n", + " )\n", + " for year in range(2024, 2025):\n", + " loss_matrix, targets_array = build_loss_matrix(\n", + " input_dataset, year\n", + " )\n", + "\n", + " bad_mask = loss_matrix.columns.isin(bad_targets)\n", + " keep_mask_bool = ~bad_mask\n", + " keep_idx = np.where(keep_mask_bool)[0]\n", + " loss_matrix_clean = loss_matrix.iloc[:, keep_idx]\n", + " targets_array_clean = targets_array[keep_idx]\n", + " assert loss_matrix_clean.shape[1] == targets_array_clean.size\n", + "\n", + " optimised_weights = reweight(\n", + " original_weights,\n", + " loss_matrix_clean,\n", + " targets_array_clean,\n", + " log_path=cal_log_path, \n", + " penalty_approach=approach,\n", + " penalty_weight=penalty_weight, \n", + " epochs=10, # Reduced epochs for faster processing\n", + " )\n", + " data[\"household_weight\"][year] = optimised_weights\n", + "\n", + " # Save to HDF5 file\n", + " with h5py.File(h5_path, \"w\") as f:\n", + " for variable, values in data.items():\n", + " for year, value in values.items():\n", + " f.create_dataset(f\"{variable}/{year}\", data=value)\n", "\n", "\n", "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", @@ -115,7 +130,7 @@ " \"target_fractions\": [0.5, 0.6, 0.7, 0.8, 0.9], # fractions of the dataset to keep\n", " },\n", " \"candidate_loss_contribution\": {\n", - " \"loss_rel_change_max\": [0.00001, 0.000001, 0.0000001] # maximum relative change in\n", + " \"loss_rel_change_max\": [0.001, 0.0001, 0.00001, 0.000001, 0.0000001] # maximum relative change in loss\n", " }\n", "}\n", "\n", @@ -149,10 +164,172 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "id": "35892c9d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + "Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID 
with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid enrollment for IN with target 1623361k\n", + "Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with target 1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + "Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n", + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + "Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid enrollment for IN with target 1623361k\n", + "Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with 
target 1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + "Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n", + "Weight relative change: 100.00%\n", + "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/random_sampling_minimization/1.0_enhanced_cps_2024_random_sampling_minimization_minimised.h5\n", + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + "Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid enrollment for IN with target 1623361k\n", + "Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with target 
1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + "Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n" + ] + } + ], "source": [ "## SMALL CHECKS BELOW -- IGNORE ---\n", "\n", @@ -166,7 +343,7 @@ "# other minimization function approach is \"random_sampling_minimization\", for which you can specify the tolerance for loss relative change.\n", "\n", "for file in files:\n", - " output_path = STORAGE_FOLDER / \"random_sampling_minimization\" / f\"{1.0}_enhanced_cps_2024_minimised.h5\"\n", + " output_path = STORAGE_FOLDER / \"random_sampling_minimization\" / f\"{1.0}_enhanced_cps_2024_random_sampling_minimization_minimised.h5\"\n", " output_path.parent.mkdir(parents=True, exist_ok=True)\n", " minimise_dataset(\n", " file,\n", @@ -231,6 +408,131 @@ " for year, value in values.items():\n", " f.create_dataset(f\"{variable}/{year}\", data=value)" ] + }, + { + "cell_type": "markdown", + "id": "f8b0fe2e", + "metadata": {}, + "source": [ + "### Visualization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "225debd8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
strategyparameterdataset_sizetotal_loss
0originalnone416000.0069
1L10.001350000.0072
2L20.00240000.0072
\n", + "
" + ], + "text/plain": [ + " strategy parameter dataset_size total_loss\n", + "0 original none 41600 0.0069\n", + "1 L1 0.001 35000 0.0072\n", + "2 L2 0.002 4000 0.0072" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"\"\"\n", + "Creating dataframe to store regularization results\n", + "\"\"\"\n", + "\n", + "# Initial dataframe setup\n", + "df = pd.DataFrame({\n", + " 'strategy': ['none'],\n", + " 'parameter': ['none'],\n", + " 'dataset_size': [41310],\n", + " 'total_loss': [6.9e-3]\n", + "})\n", + "\n", + "def add_result(df, strategy, parameter, dataset_size, total_loss):\n", + " new_rows = pd.DataFrame({\n", + " 'strategy': strategy, \n", + " 'parameter': parameter, \n", + " 'dataset_size': dataset_size,\n", + " 'total_loss': total_loss\n", + " })\n", + " return pd.concat([df, new_rows], ignore_index=True)\n", + "\n", + "# Example usage\n", + "#df = add_result(df, ['L1', 'L2'], ['0.001','0.002'], [35000, 4000], [7.2e-3, 7.2e-3])\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7bb3ef3c", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "Pulling values from created calibration_log.csv and .h5 files to populate the line plot dataframe\n", + "\n", + "( I need to pull the strategy (folder name), parameter (from file title??), dataset size (from length of .h5 file), and total loss (from sum of loss column in calibration_log_file.csv))\n", + "\"\"\"\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] } ], "metadata": { From a8af62a73d438efce6e02600916ea7ff7d11272f Mon Sep 17 00:00:00 2001 From: eccuraa Date: Mon, 14 Jul 2025 20:56:08 -0400 Subject: [PATCH 43/58] pulling data from files for plotting --- test_minimization_approach.ipynb | 288 +++++++++++++++++++++++++++---- 1 file changed, 257 insertions(+), 31 deletions(-) diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index bb77568a..e3d011af 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,10 +2,19 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "d6dc9cca", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], "source": [ "from policyengine_us_data.utils.minimise import minimise_dataset, random_sampling_minimization, candidate_loss_contribution\n", "from policyengine_us_data.storage import STORAGE_FOLDER\n", @@ -30,6 +39,82 @@ "]" ] }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2f27c5ab", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 10/10 [00:04<00:00, 2.12it/s, loss=0.0101, loss_rel_change=-0.0371]\n", + "100%|██████████| 10/10 [00:05<00:00, 1.74it/s, loss=0.1, loss_rel_change=-0.00389]\n", + "100%|██████████| 10/10 [00:06<00:00, 1.62it/s, loss=3.22, loss_rel_change=-0.896]\n", + "100%|██████████| 10/10 [00:04<00:00, 2.15it/s, loss=32, loss_rel_change=-0.896] \n" + ] + } + ], + "source": [ + "\n", + "input_dataset = ExtendedCPS_2024\n", + "\n", + "approaches = [\"l0_exp\", \"l1\"] #[\"l0_sigmoid\", \"l0_log\", \"l0_exp\", \"l1\"]\n", + "penalty_weights = [1e-2, 1e-1] #[1e-5, 1e-4, 1e-3, 1e-2, 1e-1]\n", + "\n", + "def get_output_path(approach, file_name):\n", + " output_path = STORAGE_FOLDER / approach / file_name\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + " return output_path\n", + "\n", + "results = []\n", + "\n", + "for approach in approaches:\n", + " for penalty_weight in penalty_weights:\n", + " # Storing files in correct locations\n", + " cal_log_name = f\"calibration_log_{approach}_{penalty_weight}.csv\"\n", + " h5_name = f\"enhanced_cps_2024_{approach}_{penalty_weight}_minimised.h5\"\n", + " cal_log_path = get_output_path(approach, cal_log_name)\n", + " h5_path = get_output_path(approach, h5_name)\n", + "\n", + " sim = Microsimulation(dataset=input_dataset)\n", + " data = sim.dataset.load_dataset()\n", + " data[\"household_weight\"] = {}\n", + " original_weights = sim.calculate(\"household_weight\")\n", + " original_weights = original_weights.values + np.random.normal(\n", + " 1, 0.1, len(original_weights)\n", + " )\n", + " for year in range(2024, 2025):\n", + " loss_matrix, targets_array = build_loss_matrix(\n", + " input_dataset, year\n", + " )\n", + "\n", + " bad_mask = loss_matrix.columns.isin(bad_targets)\n", + " keep_mask_bool = ~bad_mask\n", + " keep_idx = np.where(keep_mask_bool)[0]\n", + " loss_matrix_clean = loss_matrix.iloc[:, keep_idx]\n", + " targets_array_clean = targets_array[keep_idx]\n", + " assert loss_matrix_clean.shape[1] == targets_array_clean.size\n", + "\n", + " optimised_weights = reweight(\n", + " original_weights,\n", + " loss_matrix_clean,\n", + " targets_array_clean,\n", + " log_path=cal_log_path, \n", + " penalty_approach=approach,\n", + " penalty_weight=penalty_weight, \n", + " epochs=10, # Reduced epochs for faster processing\n", + " )\n", + " data[\"household_weight\"][year] = optimised_weights\n", + "\n", + " # Save to HDF5 file\n", + " with h5py.File(h5_path, \"w\") as f:\n", + " for variable, values in data.items():\n", + " for year, value in values.items():\n", + " f.create_dataset(f\"{variable}/{year}\", data=value)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -419,7 +504,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "id": "225debd8", "metadata": {}, "outputs": [ @@ -453,37 +538,21 @@ " \n", " \n", " 0\n", - " original\n", " none\n", - " 41600\n", + " none\n", + " 41310\n", " 0.0069\n", " \n", - " \n", - " 1\n", - " L1\n", - " 0.001\n", - " 35000\n", - 
" 0.0072\n", - " \n", - " \n", - " 2\n", - " L2\n", - " 0.002\n", - " 4000\n", - " 0.0072\n", - " \n", " \n", "\n", "" ], "text/plain": [ - " strategy parameter dataset_size total_loss\n", - "0 original none 41600 0.0069\n", - "1 L1 0.001 35000 0.0072\n", - "2 L2 0.002 4000 0.0072" + " strategy parameter dataset_size total_loss\n", + "0 none none 41310 0.0069" ] }, - "execution_count": 12, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -494,7 +563,7 @@ "\"\"\"\n", "\n", "# Initial dataframe setup\n", - "df = pd.DataFrame({\n", + "reg_results_df = pd.DataFrame({\n", " 'strategy': ['none'],\n", " 'parameter': ['none'],\n", " 'dataset_size': [41310],\n", @@ -508,19 +577,116 @@ " 'dataset_size': dataset_size,\n", " 'total_loss': total_loss\n", " })\n", - " return pd.concat([df, new_rows], ignore_index=True)\n", + " return pd.concat([reg_results_df, new_rows], ignore_index=True)\n", "\n", "# Example usage\n", - "#df = add_result(df, ['L1', 'L2'], ['0.001','0.002'], [35000, 4000], [7.2e-3, 7.2e-3])\n", - "df" + "#reg_results_df = add_result(reg_results_df, ['L1', 'L2'], ['0.001','0.002'], [35000, 4000], [7.2e-3, 7.2e-3])\n", + "reg_results_df" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "id": "7bb3ef3c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
strategyparameterdataset_sizetotal_loss
0nonenone413100.006900
1l0_exp0.01413101263.410322
2l0_exp0.1413101263.410322
3l0_exp0.1413101263.410322
4l10.01413101263.410322
5l10.1413101263.410322
6l10.1413101263.410322
\n", + "
" + ], + "text/plain": [ + " strategy parameter dataset_size total_loss\n", + "0 none none 41310 0.006900\n", + "1 l0_exp 0.01 41310 1263.410322\n", + "2 l0_exp 0.1 41310 1263.410322\n", + "3 l0_exp 0.1 41310 1263.410322\n", + "4 l1 0.01 41310 1263.410322\n", + "5 l1 0.1 41310 1263.410322\n", + "6 l1 0.1 41310 1263.410322" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "\"\"\"\n", "Pulling values from created calibration_log.csv and .h5 files to populate the line plot dataframe\n", @@ -528,10 +694,70 @@ "( I need to pull the strategy (folder name), parameter (from file title??), dataset size (from length of .h5 file), and total loss (from sum of loss column in calibration_log_file.csv))\n", "\"\"\"\n", "\n", + "approaches = [\"l0_exp\", \"l1\"] \n", + "penalty_weights = [1e-2, 1e-1]\n", + "\n", + "def get_output_path(approach, file_name):\n", + " output_path = STORAGE_FOLDER / approach / file_name\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + " return output_path\n", + "\n", + "for approach in approaches:\n", + " total_size = []\n", + " total_loss = []\n", + " for penalty_weight in penalty_weights:\n", + " strategy = approach\n", + " parameter = penalty_weight\n", + "\n", + " # Pull length of .h5 file\n", + " h5_name = f\"enhanced_cps_2024_{strategy}_{parameter}_minimised.h5\"\n", + " h5_path = get_output_path(strategy, h5_name)\n", + " # see if this works\n", + " dataset_size = len(h5py.File(h5_path, \"r\")['household_weight/2024'])\n", + " total_size.append(dataset_size)\n", + "\n", + " # Pull sum of loss column\n", + " cal_log_name = f\"calibration_log_{approach}_{penalty_weight}.csv\"\n", + " cal_log_path = get_output_path(approach, cal_log_name)\n", + " loss_sum = pd.read_csv(cal_log_path)['loss'].sum()\n", + " total_loss.append(loss_sum)\n", + "\n", + " reg_results_df = add_result(reg_results_df, strategy, parameter, total_size, total_loss)\n", + " # does this weird recursion work?\n", + "\n", + "\n", + "\n", "\n", + "'''\n", "\n", + "fraction = [0.5, 0.6, 0.7, 0.8, 0.9]\n", "\n", - "\n" + "for fraction in fraction:\n", + " strategy = \"random_sampling_minimization\"\n", + " parameter = fraction\n", + "\n", + " # Pull length of .h5 file\n", + " h5_name = f\"{fraction}_enhanced_cps_2024_random_sampling_minimization_minimised.h5\"\n", + " h5_path = STORAGE_FOLDER / strategy / h5_name\n", + " dataset_size = len(h5py.File(h5_path, \"r\")['household_weight/2024'])\n", + "\n", + " # Pull sum of loss column\n", + " cal_log_name = f\"{fraction}_enhanced_cps_2024_random_sampling_minimization_minimised_calibration_log.csv\"\n", + " cal_log_path = STORAGE_FOLDER / strategy / cal_log_name\n", + " total_loss = pd.read_csv(cal_log_path)['loss'].sum()\n", + "\n", + " add_result(df, strategy, parameter, dataset_size, total_loss)\n", + "\n", + "'''\n", + "reg_results_df\n" + ] + }, + { + "cell_type": "markdown", + "id": "5b203ccd", + "metadata": {}, + "source": [ + "## Plotting" ] } ], From a917d35200d6aac3d25286c2abad7cad3b1b4ba3 Mon Sep 17 00:00:00 2001 From: eccuraa Date: Mon, 14 Jul 2025 21:15:06 -0400 Subject: [PATCH 44/58] deleted testing cell --- test_minimization_approach.ipynb | 105 +++++-------------------------- 1 file changed, 15 insertions(+), 90 deletions(-) diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index e3d011af..972bd0b7 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,19 +2,10 @@ "cells": [ { "cell_type": "code", - 
"execution_count": 1, + "execution_count": 2, "id": "d6dc9cca", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], + "outputs": [], "source": [ "from policyengine_us_data.utils.minimise import minimise_dataset, random_sampling_minimization, candidate_loss_contribution\n", "from policyengine_us_data.storage import STORAGE_FOLDER\n", @@ -39,82 +30,6 @@ "]" ] }, - { - "cell_type": "code", - "execution_count": 2, - "id": "2f27c5ab", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 10/10 [00:04<00:00, 2.12it/s, loss=0.0101, loss_rel_change=-0.0371]\n", - "100%|██████████| 10/10 [00:05<00:00, 1.74it/s, loss=0.1, loss_rel_change=-0.00389]\n", - "100%|██████████| 10/10 [00:06<00:00, 1.62it/s, loss=3.22, loss_rel_change=-0.896]\n", - "100%|██████████| 10/10 [00:04<00:00, 2.15it/s, loss=32, loss_rel_change=-0.896] \n" - ] - } - ], - "source": [ - "\n", - "input_dataset = ExtendedCPS_2024\n", - "\n", - "approaches = [\"l0_exp\", \"l1\"] #[\"l0_sigmoid\", \"l0_log\", \"l0_exp\", \"l1\"]\n", - "penalty_weights = [1e-2, 1e-1] #[1e-5, 1e-4, 1e-3, 1e-2, 1e-1]\n", - "\n", - "def get_output_path(approach, file_name):\n", - " output_path = STORAGE_FOLDER / approach / file_name\n", - " output_path.parent.mkdir(parents=True, exist_ok=True)\n", - " return output_path\n", - "\n", - "results = []\n", - "\n", - "for approach in approaches:\n", - " for penalty_weight in penalty_weights:\n", - " # Storing files in correct locations\n", - " cal_log_name = f\"calibration_log_{approach}_{penalty_weight}.csv\"\n", - " h5_name = f\"enhanced_cps_2024_{approach}_{penalty_weight}_minimised.h5\"\n", - " cal_log_path = get_output_path(approach, cal_log_name)\n", - " h5_path = get_output_path(approach, h5_name)\n", - "\n", - " sim = Microsimulation(dataset=input_dataset)\n", - " data = sim.dataset.load_dataset()\n", - " data[\"household_weight\"] = {}\n", - " original_weights = sim.calculate(\"household_weight\")\n", - " original_weights = original_weights.values + np.random.normal(\n", - " 1, 0.1, len(original_weights)\n", - " )\n", - " for year in range(2024, 2025):\n", - " loss_matrix, targets_array = build_loss_matrix(\n", - " input_dataset, year\n", - " )\n", - "\n", - " bad_mask = loss_matrix.columns.isin(bad_targets)\n", - " keep_mask_bool = ~bad_mask\n", - " keep_idx = np.where(keep_mask_bool)[0]\n", - " loss_matrix_clean = loss_matrix.iloc[:, keep_idx]\n", - " targets_array_clean = targets_array[keep_idx]\n", - " assert loss_matrix_clean.shape[1] == targets_array_clean.size\n", - "\n", - " optimised_weights = reweight(\n", - " original_weights,\n", - " loss_matrix_clean,\n", - " targets_array_clean,\n", - " log_path=cal_log_path, \n", - " penalty_approach=approach,\n", - " penalty_weight=penalty_weight, \n", - " epochs=10, # Reduced epochs for faster processing\n", - " )\n", - " data[\"household_weight\"][year] = optimised_weights\n", - "\n", - " # Save to HDF5 file\n", - " with h5py.File(h5_path, \"w\") as f:\n", - " for variable, values in data.items():\n", - " for year, value in values.items():\n", - " f.create_dataset(f\"{variable}/{year}\", data=value)" - ] - }, { "cell_type": "code", 
"execution_count": null, @@ -136,7 +51,17 @@ "execution_count": null, "id": "db975ac1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 10/10 [00:03<00:00, 3.00it/s, loss=9.1e-5, loss_rel_change=-0.809] \n", + "100%|██████████| 10/10 [00:03<00:00, 2.96it/s, loss=0.000181, loss_rel_change=-0.679]\n", + "100%|██████████| 10/10 [00:03<00:00, 2.98it/s, loss=0.00108, loss_rel_change=-0.273]\n" + ] + } + ], "source": [ "## ALL TESTS\n", "\n", @@ -763,7 +688,7 @@ ], "metadata": { "kernelspec": { - "display_name": "pe", + "display_name": "policyengine-us-data", "language": "python", "name": "python3" }, @@ -777,7 +702,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.11" + "version": "3.11.13" } }, "nbformat": 4, From 734f54f4325278996d9090df2cb896417581179d Mon Sep 17 00:00:00 2001 From: eccuraa Date: Tue, 15 Jul 2025 14:01:32 -0400 Subject: [PATCH 45/58] current testing arena for Ben --- test_minimization_approach.ipynb | 665 +++++++++++++++++-------------- 1 file changed, 374 insertions(+), 291 deletions(-) diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 972bd0b7..5407c3ea 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 66, "id": "d6dc9cca", "metadata": {}, "outputs": [], @@ -46,6 +46,14 @@ "# After minimization through \"candidate_loss_contribution\" and a 0.001 max error change: 24786" ] }, + { + "cell_type": "markdown", + "id": "e99994d3", + "metadata": {}, + "source": [ + "# Enhanced_CPS_2024.py Approaches" + ] + }, { "cell_type": "code", "execution_count": null, @@ -58,7 +66,82 @@ "text": [ "100%|██████████| 10/10 [00:03<00:00, 3.00it/s, loss=9.1e-5, loss_rel_change=-0.809] \n", "100%|██████████| 10/10 [00:03<00:00, 2.96it/s, loss=0.000181, loss_rel_change=-0.679]\n", - "100%|██████████| 10/10 [00:03<00:00, 2.98it/s, loss=0.00108, loss_rel_change=-0.273]\n" + "100%|██████████| 10/10 [00:03<00:00, 2.98it/s, loss=0.00108, loss_rel_change=-0.273]\n", + "100%|██████████| 10/10 [00:02<00:00, 3.59it/s, loss=0.0101, loss_rel_change=-0.0377]\n", + "100%|██████████| 10/10 [00:02<00:00, 3.46it/s, loss=0.1, loss_rel_change=-0.00391]\n", + "100%|██████████| 10/10 [00:02<00:00, 3.52it/s, loss=0.000191, loss_rel_change=-0.672]\n", + "100%|██████████| 10/10 [00:03<00:00, 2.89it/s, loss=0.00116, loss_rel_change=-0.274]\n", + "100%|██████████| 10/10 [00:02<00:00, 3.57it/s, loss=0.00978, loss_rel_change=-0.166]\n", + "100%|██████████| 10/10 [00:02<00:00, 3.34it/s, loss=0.0881, loss_rel_change=-0.22]\n", + "100%|██████████| 10/10 [00:02<00:00, 3.55it/s, loss=0.866, loss_rel_change=-0.23]\n", + "100%|██████████| 10/10 [00:03<00:00, 3.31it/s, loss=9.12e-5, loss_rel_change=-0.812]\n", + "100%|██████████| 10/10 [00:03<00:00, 3.26it/s, loss=0.00018, loss_rel_change=-0.687]\n", + "100%|██████████| 10/10 [00:02<00:00, 3.47it/s, loss=0.00108, loss_rel_change=-0.263]\n", + "100%|██████████| 10/10 [00:03<00:00, 3.21it/s, loss=0.0101, loss_rel_change=-0.0373]\n", + "100%|██████████| 10/10 [00:02<00:00, 3.37it/s, loss=0.1, loss_rel_change=-0.00383]\n", + "100%|██████████| 10/10 [00:03<00:00, 3.28it/s, loss=0.00389, loss_rel_change=-0.875]\n", + "100%|██████████| 10/10 [00:03<00:00, 3.17it/s, loss=0.0328, loss_rel_change=-0.894]\n", + "100%|██████████| 10/10 [00:03<00:00, 2.72it/s, loss=0.321, 
loss_rel_change=-0.896]\n", + "100%|██████████| 10/10 [00:02<00:00, 3.49it/s, loss=3.21, loss_rel_change=-0.896]\n", + "100%|██████████| 10/10 [00:02<00:00, 3.37it/s, loss=32.1, loss_rel_change=-0.896]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== CALIBRATION LOG DEBUG ===\n", + "File path: /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/enhanced_cps_2024.h5\n", + "Epoch: 0\n", + "Number of households: 41310\n", + "Total weight: 12764381616743.21\n", + "Weight range: 0.54 to 1303728.75\n", + "Loss matrix shape: (41310, 2813)\n", + "Number of targets: 2813\n", + "After filtering bad targets:\n", + "Loss matrix clean shape: (41310, 2805)\n", + "Number of clean targets: 2805\n", + "Estimates shape: (2805,)\n", + "Estimates sum: 324584770671300.88\n", + "First 3 estimates: nation/irs/adjusted gross income/total/AGI in -inf-inf/taxable/All 1.498784e+13\n", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/All 1.609638e+10\n", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/All 6.707770e+10\n", + "dtype: float64\n", + "First 3 targets: [1.62972204e+13 1.68634879e+10 6.76819729e+10]\n", + "Mean absolute error: 17235490830.73\n", + "Mean relative error: 0.0997\n", + "=== END DEBUG ===\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250/250 [01:38<00:00, 2.54it/s, loss=3.62e-5, loss_rel_change=-0.301]\n", + "100%|██████████| 250/250 [01:35<00:00, 2.62it/s, loss=3.58e-5, loss_rel_change=-0.294]\n", + "100%|██████████| 250/250 [01:33<00:00, 2.68it/s, loss=3.34e-5, loss_rel_change=-0.376]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Weight relative change: 99.95%\n", + "Re-calibrating final selected households...\n" + ] + }, + { + "ename": "AttributeError", + "evalue": "'numpy.ndarray' object has no attribute 'columns'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 92\u001b[0m\n\u001b[1;32m 90\u001b[0m output_path \u001b[38;5;241m=\u001b[39m STORAGE_FOLDER \u001b[38;5;241m/\u001b[39m approach \u001b[38;5;241m/\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvalue\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_enhanced_cps_2024_minimised.h5\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 91\u001b[0m output_path\u001b[38;5;241m.\u001b[39mparent\u001b[38;5;241m.\u001b[39mmkdir(parents\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m---> 92\u001b[0m \u001b[43mminimise_dataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 93\u001b[0m \u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 94\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 95\u001b[0m \u001b[43m \u001b[49m\u001b[43mminimization_function\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mminimization_function\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 96\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget_fractions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 97\u001b[0m \u001b[43m 
\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 98\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m params \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mloss_rel_change_max\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 99\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m files:\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/utils/minimise.py:430\u001b[0m, in \u001b[0;36mminimise_dataset\u001b[0;34m(dataset, output_path, minimization_function, **kwargs)\u001b[0m\n\u001b[1;32m 428\u001b[0m \u001b[38;5;66;03m# Re-calibrate the final selected households to hit targets\u001b[39;00m\n\u001b[1;32m 429\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRe-calibrating final selected households...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 430\u001b[0m calibrated_weights \u001b[38;5;241m=\u001b[39m \u001b[43mreweight\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 431\u001b[0m \u001b[43m \u001b[49m\u001b[43minitial_weights\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 432\u001b[0m \u001b[43m \u001b[49m\u001b[43mloss_matrix_clean\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Convert to numpy array\u001b[39;49;00m\n\u001b[1;32m 433\u001b[0m \u001b[43m \u001b[49m\u001b[43mtargets_clean\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 434\u001b[0m \u001b[43m \u001b[49m\u001b[43mepochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m250\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Reduced epochs for faster processing\u001b[39;49;00m\n\u001b[1;32m 435\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 436\u001b[0m sim\u001b[38;5;241m.\u001b[39mset_input(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhousehold_weight\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;241m2024\u001b[39m, calibrated_weights)\n\u001b[1;32m 437\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFinal calibration completed successfully\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/datasets/cps/enhanced_cps.py:47\u001b[0m, in \u001b[0;36mreweight\u001b[0;34m(original_weights, loss_matrix, targets_array, dropout_rate, epochs, log_path, penalty_approach, penalty_weight)\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mreweight\u001b[39m(\n\u001b[1;32m 38\u001b[0m original_weights,\n\u001b[1;32m 39\u001b[0m loss_matrix,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 45\u001b[0m penalty_weight\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 46\u001b[0m ):\n\u001b[0;32m---> 47\u001b[0m target_names \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39marray(\u001b[43mloss_matrix\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m)\n\u001b[1;32m 48\u001b[0m is_national \u001b[38;5;241m=\u001b[39m loss_matrix\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mstr\u001b[38;5;241m.\u001b[39mstartswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnation/\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 49\u001b[0m loss_matrix \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mtensor(loss_matrix\u001b[38;5;241m.\u001b[39mvalues, dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat32)\n", + 
"\u001b[0;31mAttributeError\u001b[0m: 'numpy.ndarray' object has no attribute 'columns'" ] } ], @@ -66,7 +149,6 @@ "## ALL TESTS\n", "\n", "## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation\n", - "\n", "input_dataset = ExtendedCPS_2024\n", "\n", "approaches = [\"l0_sigmoid\", \"l0_log\", \"l0_exp\", \"l1\"]\n", @@ -113,7 +195,7 @@ " log_path=cal_log_path, \n", " penalty_approach=approach,\n", " penalty_weight=penalty_weight, \n", - " epochs=10, # Reduced epochs for faster processing\n", + " epochs=250, # Reduced epochs for faster processing\n", " )\n", " data[\"household_weight\"][year] = optimised_weights\n", "\n", @@ -121,9 +203,83 @@ " with h5py.File(h5_path, \"w\") as f:\n", " for variable, values in data.items():\n", " for year, value in values.items():\n", - " f.create_dataset(f\"{variable}/{year}\", data=value)\n", - "\n", - "\n", + " f.create_dataset(f\"{variable}/{year}\", data=value)" + ] + }, + { + "cell_type": "markdown", + "id": "69ff392d", + "metadata": {}, + "source": [ + "# Minimise.py approaches" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "aeab67b3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== CALIBRATION LOG DEBUG ===\n", + "File path: /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/enhanced_cps_2024.h5\n", + "Epoch: 0\n", + "Number of households: 41310\n", + "Total weight: 12764381616743.21\n", + "Weight range: 0.54 to 1303728.75\n", + "Loss matrix shape: (41310, 2813)\n", + "Number of targets: 2813\n", + "After filtering bad targets:\n", + "Loss matrix clean shape: (41310, 2805)\n", + "Number of clean targets: 2805\n", + "Estimates shape: (2805,)\n", + "Estimates sum: 324584770671300.88\n", + "First 3 estimates: nation/irs/adjusted gross income/total/AGI in -inf-inf/taxable/All 1.498784e+13\n", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/All 1.609638e+10\n", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/All 6.707770e+10\n", + "dtype: float64\n", + "First 3 targets: [1.62972204e+13 1.68634879e+10 6.76819729e+10]\n", + "Mean absolute error: 17235490830.73\n", + "Mean relative error: 0.0997\n", + "=== END DEBUG ===\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250/250 [03:38<00:00, 1.14it/s, loss=3.34e-5, loss_rel_change=-0.357]\n", + "100%|██████████| 250/250 [02:39<00:00, 1.57it/s, loss=3.52e-5, loss_rel_change=-0.334]\n", + "100%|██████████| 250/250 [01:32<00:00, 2.70it/s, loss=3.39e-5, loss_rel_change=-0.34] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Weight relative change: 99.95%\n", + "Re-calibrating final selected households...\n" + ] + }, + { + "ename": "AttributeError", + "evalue": "'numpy.ndarray' object has no attribute 'columns'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[67], line 31\u001b[0m\n\u001b[1;32m 29\u001b[0m output_path \u001b[38;5;241m=\u001b[39m STORAGE_FOLDER \u001b[38;5;241m/\u001b[39m approach \u001b[38;5;241m/\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvalue\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_enhanced_cps_2024_minimised.h5\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 
30\u001b[0m output_path\u001b[38;5;241m.\u001b[39mparent\u001b[38;5;241m.\u001b[39mmkdir(parents\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m---> 31\u001b[0m \u001b[43mminimise_dataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 32\u001b[0m \u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 33\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 34\u001b[0m \u001b[43m \u001b[49m\u001b[43mminimization_function\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mminimization_function\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 35\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget_fractions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 36\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m params \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mloss_rel_change_max\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 38\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m files:\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/utils/minimise.py:430\u001b[0m, in \u001b[0;36mminimise_dataset\u001b[0;34m(dataset, output_path, minimization_function, **kwargs)\u001b[0m\n\u001b[1;32m 428\u001b[0m \u001b[38;5;66;03m# Re-calibrate the final selected households to hit targets\u001b[39;00m\n\u001b[1;32m 429\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRe-calibrating final selected households...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 430\u001b[0m calibrated_weights \u001b[38;5;241m=\u001b[39m \u001b[43mreweight\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 431\u001b[0m \u001b[43m \u001b[49m\u001b[43minitial_weights\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 432\u001b[0m \u001b[43m \u001b[49m\u001b[43mloss_matrix_clean\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Convert to numpy array\u001b[39;49;00m\n\u001b[1;32m 433\u001b[0m \u001b[43m \u001b[49m\u001b[43mtargets_clean\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 434\u001b[0m \u001b[43m \u001b[49m\u001b[43mepochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Reduced epochs for faster processing\u001b[39;49;00m\n\u001b[1;32m 435\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 436\u001b[0m sim\u001b[38;5;241m.\u001b[39mset_input(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhousehold_weight\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;241m2024\u001b[39m, calibrated_weights)\n\u001b[1;32m 437\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFinal calibration completed successfully\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/datasets/cps/enhanced_cps.py:47\u001b[0m, in \u001b[0;36mreweight\u001b[0;34m(original_weights, loss_matrix, targets_array, dropout_rate, epochs, log_path, penalty_approach, penalty_weight)\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mreweight\u001b[39m(\n\u001b[1;32m 
38\u001b[0m original_weights,\n\u001b[1;32m 39\u001b[0m loss_matrix,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 45\u001b[0m penalty_weight\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 46\u001b[0m ):\n\u001b[0;32m---> 47\u001b[0m target_names \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39marray(\u001b[43mloss_matrix\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m)\n\u001b[1;32m 48\u001b[0m is_national \u001b[38;5;241m=\u001b[39m loss_matrix\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mstr\u001b[38;5;241m.\u001b[39mstartswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnation/\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 49\u001b[0m loss_matrix \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mtensor(loss_matrix\u001b[38;5;241m.\u001b[39mvalues, dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat32)\n", + "\u001b[0;31mAttributeError\u001b[0m: 'numpy.ndarray' object has no attribute 'columns'" + ] + } + ], + "source": [ "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", "\n", "files = [\n", @@ -137,10 +293,10 @@ "\n", "optional_params = {\n", " \"random_sampling_minimization\": {\n", - " \"target_fractions\": [0.5, 0.6, 0.7, 0.8, 0.9], # fractions of the dataset to keep\n", + " \"target_fractions\": [0.5, 0.6]#, 0.7, 0.8, 0.9], # fractions of the dataset to keep\n", " },\n", " \"candidate_loss_contribution\": {\n", - " \"loss_rel_change_max\": [0.001, 0.0001, 0.00001, 0.000001, 0.0000001] # maximum relative change in loss\n", + " \"loss_rel_change_max\": [0.001, 0.0001]#, 0.00001, 0.000001, 0.0000001] # maximum relative change in loss\n", " }\n", "}\n", "\n", @@ -173,250 +329,22 @@ ] }, { - "cell_type": "code", - "execution_count": 22, - "id": "35892c9d", + "cell_type": "markdown", + "id": "fa1ea957", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Targeting Medicaid enrollment for AK with target 231577k\n", - "Targeting Medicaid enrollment for AL with target 766009k\n", - "Targeting Medicaid enrollment for AR with target 733561k\n", - "Targeting Medicaid enrollment for AZ with target 1778734k\n", - "Targeting Medicaid enrollment for CA with target 12172695k\n", - "Targeting Medicaid enrollment for CO with target 1058326k\n", - "Targeting Medicaid enrollment for CT with target 904321k\n", - "Targeting Medicaid enrollment for DC with target 240020k\n", - "Targeting Medicaid enrollment for DE with target 236840k\n", - "Targeting Medicaid enrollment for FL with target 3568648k\n", - "Targeting Medicaid enrollment for GA with target 1699279k\n", - "Targeting Medicaid enrollment for HI with target 376318k\n", - "Targeting Medicaid enrollment for IA with target 586748k\n", - "Targeting Medicaid enrollment for ID with target 296968k\n", - "Targeting Medicaid enrollment for IL with target 2918179k\n", - "Targeting Medicaid enrollment for IN with target 1623361k\n", - "Targeting Medicaid enrollment for KS with target 335902k\n", - "Targeting Medicaid enrollment for KY with target 1244822k\n", - "Targeting Medicaid enrollment for LA with target 1377806k\n", - "Targeting Medicaid enrollment for MA with target 1453344k\n", - "Targeting Medicaid enrollment for MD with target 1280697k\n", - "Targeting Medicaid enrollment for ME with target 322306k\n", - "Targeting Medicaid enrollment for MI with target 2194067k\n", - "Targeting Medicaid enrollment for MN with target 1146667k\n", - "Targeting 
Medicaid enrollment for MO with target 1118780k\n", - "Targeting Medicaid enrollment for MS with target 514730k\n", - "Targeting Medicaid enrollment for MT with target 193278k\n", - "Targeting Medicaid enrollment for NC with target 2469712k\n", - "Targeting Medicaid enrollment for ND with target 100543k\n", - "Targeting Medicaid enrollment for NE with target 302971k\n", - "Targeting Medicaid enrollment for NH with target 166813k\n", - "Targeting Medicaid enrollment for NJ with target 1506239k\n", - "Targeting Medicaid enrollment for NM with target 686825k\n", - "Targeting Medicaid enrollment for NV with target 713936k\n", - "Targeting Medicaid enrollment for NY with target 5946806k\n", - "Targeting Medicaid enrollment for OH with target 2596879k\n", - "Targeting Medicaid enrollment for OK with target 894911k\n", - "Targeting Medicaid enrollment for OR with target 1123313k\n", - "Targeting Medicaid enrollment for PA with target 2783389k\n", - "Targeting Medicaid enrollment for RI with target 273400k\n", - "Targeting Medicaid enrollment for SC with target 932515k\n", - "Targeting Medicaid enrollment for SD with target 126952k\n", - "Targeting Medicaid enrollment for TN with target 1268904k\n", - "Targeting Medicaid enrollment for TX with target 3821806k\n", - "Targeting Medicaid enrollment for UT with target 300742k\n", - "Targeting Medicaid enrollment for VA with target 1596777k\n", - "Targeting Medicaid enrollment for VT with target 151833k\n", - "Targeting Medicaid enrollment for WA with target 1776116k\n", - "Targeting Medicaid enrollment for WI with target 1108320k\n", - "Targeting Medicaid enrollment for WV with target 467632k\n", - "Targeting Medicaid enrollment for WY with target 57320k\n", - "Targeting Medicaid enrollment for AK with target 231577k\n", - "Targeting Medicaid enrollment for AL with target 766009k\n", - "Targeting Medicaid enrollment for AR with target 733561k\n", - "Targeting Medicaid enrollment for AZ with target 1778734k\n", - "Targeting Medicaid enrollment for CA with target 12172695k\n", - "Targeting Medicaid enrollment for CO with target 1058326k\n", - "Targeting Medicaid enrollment for CT with target 904321k\n", - "Targeting Medicaid enrollment for DC with target 240020k\n", - "Targeting Medicaid enrollment for DE with target 236840k\n", - "Targeting Medicaid enrollment for FL with target 3568648k\n", - "Targeting Medicaid enrollment for GA with target 1699279k\n", - "Targeting Medicaid enrollment for HI with target 376318k\n", - "Targeting Medicaid enrollment for IA with target 586748k\n", - "Targeting Medicaid enrollment for ID with target 296968k\n", - "Targeting Medicaid enrollment for IL with target 2918179k\n", - "Targeting Medicaid enrollment for IN with target 1623361k\n", - "Targeting Medicaid enrollment for KS with target 335902k\n", - "Targeting Medicaid enrollment for KY with target 1244822k\n", - "Targeting Medicaid enrollment for LA with target 1377806k\n", - "Targeting Medicaid enrollment for MA with target 1453344k\n", - "Targeting Medicaid enrollment for MD with target 1280697k\n", - "Targeting Medicaid enrollment for ME with target 322306k\n", - "Targeting Medicaid enrollment for MI with target 2194067k\n", - "Targeting Medicaid enrollment for MN with target 1146667k\n", - "Targeting Medicaid enrollment for MO with target 1118780k\n", - "Targeting Medicaid enrollment for MS with target 514730k\n", - "Targeting Medicaid enrollment for MT with target 193278k\n", - "Targeting Medicaid enrollment for NC with target 2469712k\n", - "Targeting 
Medicaid enrollment for ND with target 100543k\n", - "Targeting Medicaid enrollment for NE with target 302971k\n", - "Targeting Medicaid enrollment for NH with target 166813k\n", - "Targeting Medicaid enrollment for NJ with target 1506239k\n", - "Targeting Medicaid enrollment for NM with target 686825k\n", - "Targeting Medicaid enrollment for NV with target 713936k\n", - "Targeting Medicaid enrollment for NY with target 5946806k\n", - "Targeting Medicaid enrollment for OH with target 2596879k\n", - "Targeting Medicaid enrollment for OK with target 894911k\n", - "Targeting Medicaid enrollment for OR with target 1123313k\n", - "Targeting Medicaid enrollment for PA with target 2783389k\n", - "Targeting Medicaid enrollment for RI with target 273400k\n", - "Targeting Medicaid enrollment for SC with target 932515k\n", - "Targeting Medicaid enrollment for SD with target 126952k\n", - "Targeting Medicaid enrollment for TN with target 1268904k\n", - "Targeting Medicaid enrollment for TX with target 3821806k\n", - "Targeting Medicaid enrollment for UT with target 300742k\n", - "Targeting Medicaid enrollment for VA with target 1596777k\n", - "Targeting Medicaid enrollment for VT with target 151833k\n", - "Targeting Medicaid enrollment for WA with target 1776116k\n", - "Targeting Medicaid enrollment for WI with target 1108320k\n", - "Targeting Medicaid enrollment for WV with target 467632k\n", - "Targeting Medicaid enrollment for WY with target 57320k\n", - "Weight relative change: 100.00%\n", - "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/random_sampling_minimization/1.0_enhanced_cps_2024_random_sampling_minimization_minimised.h5\n", - "Targeting Medicaid enrollment for AK with target 231577k\n", - "Targeting Medicaid enrollment for AL with target 766009k\n", - "Targeting Medicaid enrollment for AR with target 733561k\n", - "Targeting Medicaid enrollment for AZ with target 1778734k\n", - "Targeting Medicaid enrollment for CA with target 12172695k\n", - "Targeting Medicaid enrollment for CO with target 1058326k\n", - "Targeting Medicaid enrollment for CT with target 904321k\n", - "Targeting Medicaid enrollment for DC with target 240020k\n", - "Targeting Medicaid enrollment for DE with target 236840k\n", - "Targeting Medicaid enrollment for FL with target 3568648k\n", - "Targeting Medicaid enrollment for GA with target 1699279k\n", - "Targeting Medicaid enrollment for HI with target 376318k\n", - "Targeting Medicaid enrollment for IA with target 586748k\n", - "Targeting Medicaid enrollment for ID with target 296968k\n", - "Targeting Medicaid enrollment for IL with target 2918179k\n", - "Targeting Medicaid enrollment for IN with target 1623361k\n", - "Targeting Medicaid enrollment for KS with target 335902k\n", - "Targeting Medicaid enrollment for KY with target 1244822k\n", - "Targeting Medicaid enrollment for LA with target 1377806k\n", - "Targeting Medicaid enrollment for MA with target 1453344k\n", - "Targeting Medicaid enrollment for MD with target 1280697k\n", - "Targeting Medicaid enrollment for ME with target 322306k\n", - "Targeting Medicaid enrollment for MI with target 2194067k\n", - "Targeting Medicaid enrollment for MN with target 1146667k\n", - "Targeting Medicaid enrollment for MO with target 1118780k\n", - "Targeting Medicaid enrollment for MS with target 514730k\n", - "Targeting Medicaid enrollment for MT with target 193278k\n", - "Targeting Medicaid enrollment for NC with target 2469712k\n", - "Targeting Medicaid 
enrollment for ND with target 100543k\n", - "Targeting Medicaid enrollment for NE with target 302971k\n", - "Targeting Medicaid enrollment for NH with target 166813k\n", - "Targeting Medicaid enrollment for NJ with target 1506239k\n", - "Targeting Medicaid enrollment for NM with target 686825k\n", - "Targeting Medicaid enrollment for NV with target 713936k\n", - "Targeting Medicaid enrollment for NY with target 5946806k\n", - "Targeting Medicaid enrollment for OH with target 2596879k\n", - "Targeting Medicaid enrollment for OK with target 894911k\n", - "Targeting Medicaid enrollment for OR with target 1123313k\n", - "Targeting Medicaid enrollment for PA with target 2783389k\n", - "Targeting Medicaid enrollment for RI with target 273400k\n", - "Targeting Medicaid enrollment for SC with target 932515k\n", - "Targeting Medicaid enrollment for SD with target 126952k\n", - "Targeting Medicaid enrollment for TN with target 1268904k\n", - "Targeting Medicaid enrollment for TX with target 3821806k\n", - "Targeting Medicaid enrollment for UT with target 300742k\n", - "Targeting Medicaid enrollment for VA with target 1596777k\n", - "Targeting Medicaid enrollment for VT with target 151833k\n", - "Targeting Medicaid enrollment for WA with target 1776116k\n", - "Targeting Medicaid enrollment for WI with target 1108320k\n", - "Targeting Medicaid enrollment for WV with target 467632k\n", - "Targeting Medicaid enrollment for WY with target 57320k\n" - ] - } - ], "source": [ - "## SMALL CHECKS BELOW -- IGNORE ---\n", - "\n", - "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", - "\n", - "files = [\n", - " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", - " ]\n", - "\n", - "minimization_function = random_sampling_minimization\n", - "# other minimization function approach is \"random_sampling_minimization\", for which you can specify the tolerance for loss relative change.\n", - "\n", - "for file in files:\n", - " output_path = STORAGE_FOLDER / \"random_sampling_minimization\" / f\"{1.0}_enhanced_cps_2024_random_sampling_minimization_minimised.h5\"\n", - " output_path.parent.mkdir(parents=True, exist_ok=True)\n", - " minimise_dataset(\n", - " file,\n", - " output_path,\n", - " minimization_function=minimization_function, \n", - " target_fractions=[1.0]\n", - " )" + "### (Temporary) Cleaning of data (removing weights smaller than epsilon)" ] }, { "cell_type": "code", "execution_count": null, - "id": "b4cf8e89", + "id": "e88df261", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250/250 [01:24<00:00, 2.98it/s, loss=3.37e-5, loss_rel_change=-0.92] \n" - ] - } - ], + "outputs": [], "source": [ - "input_dataset = ExtendedCPS_2024\n", - "\n", - "sim = Microsimulation(dataset=input_dataset)\n", - "data = sim.dataset.load_dataset()\n", - "data[\"household_weight\"] = {}\n", - "original_weights = sim.calculate(\"household_weight\")\n", - "original_weights = original_weights.values + np.random.normal(\n", - " 1, 0.1, len(original_weights)\n", - ")\n", - "for year in range(2024, 2025):\n", - " loss_matrix, targets_array = build_loss_matrix(\n", - " input_dataset, year\n", - " )\n", - "\n", - " bad_mask = loss_matrix.columns.isin(bad_targets)\n", - " keep_mask_bool = ~bad_mask\n", - " keep_idx = np.where(keep_mask_bool)[0]\n", - " loss_matrix_clean = loss_matrix.iloc[:, keep_idx]\n", - " targets_array_clean = targets_array[keep_idx]\n", - " assert loss_matrix_clean.shape[1] == 
targets_array_clean.size\n", - " assert loss_matrix_clean.shape[1] != loss_matrix.shape[1]\n", - "\n", - " optimised_weights = reweight(\n", - " original_weights,\n", - " loss_matrix_clean,\n", - " targets_array_clean,\n", - " log_path=\"baseline_calibration_log.csv\",\n", - " epochs=250, # Reduced epochs for faster processing\n", - " )\n", - " data[\"household_weight\"][year] = optimised_weights\n", - "\n", - "output_path = STORAGE_FOLDER / \"baseline\" / \"enhanced_cps_2024_baseline.h5\"\n", - "output_path.parent.mkdir(parents=True, exist_ok=True)\n", - "\n", - "# Save to HDF5 file\n", - "with h5py.File(output_path, \"w\") as f:\n", - " for variable, values in data.items():\n", - " for year, value in values.items():\n", - " f.create_dataset(f\"{variable}/{year}\", data=value)" + "## this should go in the enhanced_cps_2024.py file, because household removal doesn't happen there\n", + "# Need to check Ben's PR." ] }, { @@ -429,7 +357,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 62, "id": "225debd8", "metadata": {}, "outputs": [ @@ -477,16 +405,31 @@ "0 none none 41310 0.0069" ] }, - "execution_count": 33, + "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"\"\"\n", + "Creating scoring of loss\n", "Creating dataframe to store regularization results\n", "\"\"\"\n", "\n", + "# Calculate quality categories\n", + "def loss_score(calibration_log):\n", + " excellent_count = (\n", + " calibration_log[\"rel_abs_error\"] < 0.05).sum() # < 5% error\n", + " good_count = (\n", + " (calibration_log[\"rel_abs_error\"] >= 0.05)\n", + " & (calibration_log[\"rel_abs_error\"] < 0.20)).sum() # 5-20% error\n", + " total_targets = len(calibration_log)\n", + " # Calculate quality score\n", + " quality_score = (excellent_count * 100 + good_count * 75) / total_targets\n", + " return quality_score\n", + "\n", + "\n", + "\n", "# Initial dataframe setup\n", "reg_results_df = pd.DataFrame({\n", " 'strategy': ['none'],\n", @@ -499,8 +442,8 @@ " new_rows = pd.DataFrame({\n", " 'strategy': strategy, \n", " 'parameter': parameter, \n", - " 'dataset_size': dataset_size,\n", - " 'total_loss': total_loss\n", + " 'dataset_size': [dataset_size],\n", + " 'total_loss': [total_loss]\n", " })\n", " return pd.concat([reg_results_df, new_rows], ignore_index=True)\n", "\n", @@ -511,7 +454,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 63, "id": "7bb3ef3c", "metadata": {}, "outputs": [ @@ -548,66 +491,210 @@ " none\n", " none\n", " 41310\n", - " 0.006900\n", + " 0.0069\n", " \n", " \n", " 1\n", - " l0_exp\n", - " 0.01\n", + " l0_sigmoid\n", + " 1.0\n", " 41310\n", - " 1263.410322\n", + " 0.0069\n", " \n", " \n", " 2\n", - " l0_exp\n", + " l0_sigmoid\n", " 0.1\n", " 41310\n", - " 1263.410322\n", + " 39.2959\n", " \n", " \n", " 3\n", + " l0_sigmoid\n", + " 0.01\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 4\n", + " l0_sigmoid\n", + " 0.001\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 5\n", + " l0_sigmoid\n", + " 0.0001\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 6\n", + " l0_sigmoid\n", + " 0.00001\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 7\n", + " l0_log\n", + " 1.0\n", + " 41310\n", + " 0.0069\n", + " \n", + " \n", + " 8\n", + " l0_log\n", + " 0.1\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 9\n", + " l0_log\n", + " 0.01\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 10\n", + " l0_log\n", + " 0.001\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 11\n", + " 
l0_log\n", + " 0.0001\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 12\n", + " l0_log\n", + " 0.00001\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 13\n", + " l0_exp\n", + " 1.0\n", + " 41310\n", + " 0.0069\n", + " \n", + " \n", + " 14\n", " l0_exp\n", " 0.1\n", " 41310\n", - " 1263.410322\n", + " 39.2959\n", " \n", " \n", - " 4\n", - " l1\n", + " 15\n", + " l0_exp\n", " 0.01\n", " 41310\n", - " 1263.410322\n", + " 39.2959\n", " \n", " \n", - " 5\n", + " 16\n", + " l0_exp\n", + " 0.001\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 17\n", + " l0_exp\n", + " 0.0001\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 18\n", + " l0_exp\n", + " 0.00001\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 19\n", " l1\n", - " 0.1\n", + " 1.0\n", " 41310\n", - " 1263.410322\n", + " 0.0069\n", " \n", " \n", - " 6\n", + " 20\n", " l1\n", " 0.1\n", " 41310\n", - " 1263.410322\n", + " 39.2959\n", + " \n", + " \n", + " 21\n", + " l1\n", + " 0.01\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 22\n", + " l1\n", + " 0.001\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 23\n", + " l1\n", + " 0.0001\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 24\n", + " l1\n", + " 0.00001\n", + " 41310\n", + " 39.2959\n", " \n", " \n", "\n", "" ], "text/plain": [ - " strategy parameter dataset_size total_loss\n", - "0 none none 41310 0.006900\n", - "1 l0_exp 0.01 41310 1263.410322\n", - "2 l0_exp 0.1 41310 1263.410322\n", - "3 l0_exp 0.1 41310 1263.410322\n", - "4 l1 0.01 41310 1263.410322\n", - "5 l1 0.1 41310 1263.410322\n", - "6 l1 0.1 41310 1263.410322" + " strategy parameter dataset_size total_loss\n", + "0 none none 41310 0.0069\n", + "1 l0_sigmoid 1.0 41310 0.0069\n", + "2 l0_sigmoid 0.1 41310 39.2959\n", + "3 l0_sigmoid 0.01 41310 39.2959\n", + "4 l0_sigmoid 0.001 41310 39.2959\n", + "5 l0_sigmoid 0.0001 41310 39.2959\n", + "6 l0_sigmoid 0.00001 41310 39.2959\n", + "7 l0_log 1.0 41310 0.0069\n", + "8 l0_log 0.1 41310 39.2959\n", + "9 l0_log 0.01 41310 39.2959\n", + "10 l0_log 0.001 41310 39.2959\n", + "11 l0_log 0.0001 41310 39.2959\n", + "12 l0_log 0.00001 41310 39.2959\n", + "13 l0_exp 1.0 41310 0.0069\n", + "14 l0_exp 0.1 41310 39.2959\n", + "15 l0_exp 0.01 41310 39.2959\n", + "16 l0_exp 0.001 41310 39.2959\n", + "17 l0_exp 0.0001 41310 39.2959\n", + "18 l0_exp 0.00001 41310 39.2959\n", + "19 l1 1.0 41310 0.0069\n", + "20 l1 0.1 41310 39.2959\n", + "21 l1 0.01 41310 39.2959\n", + "22 l1 0.001 41310 39.2959\n", + "23 l1 0.0001 41310 39.2959\n", + "24 l1 0.00001 41310 39.2959" ] }, - "execution_count": 34, + "execution_count": 63, "metadata": {}, "output_type": "execute_result" } @@ -617,21 +704,19 @@ "Pulling values from created calibration_log.csv and .h5 files to populate the line plot dataframe\n", "\n", "( I need to pull the strategy (folder name), parameter (from file title??), dataset size (from length of .h5 file), and total loss (from sum of loss column in calibration_log_file.csv))\n", - "\"\"\"\n", "\n", "approaches = [\"l0_exp\", \"l1\"] \n", "penalty_weights = [1e-2, 1e-1]\n", - "\n", - "def get_output_path(approach, file_name):\n", - " output_path = STORAGE_FOLDER / approach / file_name\n", - " output_path.parent.mkdir(parents=True, exist_ok=True)\n", - " return output_path\n", + "\"\"\"\n", + "approaches = [\"l0_sigmoid\", \"l0_log\", \"l0_exp\", \"l1\"]\n", + "penalty_weights = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]\n", + "og_size = 41310 # Original size of the dataset\n", + "og_loss = 6.9e-3 # Original loss from the baseline dataset\n", 
"\n", "for approach in approaches:\n", - " total_size = []\n", - " total_loss = []\n", + " strategy = approach\n", + " reg_results_df = add_result(reg_results_df, strategy, 1.0, og_size, og_loss)\n", " for penalty_weight in penalty_weights:\n", - " strategy = approach\n", " parameter = penalty_weight\n", "\n", " # Pull length of .h5 file\n", @@ -639,17 +724,15 @@ " h5_path = get_output_path(strategy, h5_name)\n", " # see if this works\n", " dataset_size = len(h5py.File(h5_path, \"r\")['household_weight/2024'])\n", - " total_size.append(dataset_size)\n", + " #total_size.append(dataset_size)\n", "\n", " # Pull sum of loss column\n", - " cal_log_name = f\"calibration_log_{approach}_{penalty_weight}.csv\"\n", - " cal_log_path = get_output_path(approach, cal_log_name)\n", - " loss_sum = pd.read_csv(cal_log_path)['loss'].sum()\n", - " total_loss.append(loss_sum)\n", - "\n", - " reg_results_df = add_result(reg_results_df, strategy, parameter, total_size, total_loss)\n", - " # does this weird recursion work?\n", - "\n", + " cal_log_name = f\"calibration_log_{strategy}_{parameter}.csv\"\n", + " cal_log_path = get_output_path(strategy, cal_log_name)\n", + " calibration_log = pd.read_csv(cal_log_path)\n", + " loss_value = loss_score(calibration_log)\n", + " \n", + " reg_results_df = add_result(reg_results_df, strategy, parameter, dataset_size, loss_value)\n", "\n", "\n", "\n", From 64c81498fd314b922fe04185724bccb0dbaa8524 Mon Sep 17 00:00:00 2001 From: eccuraa Date: Tue, 15 Jul 2025 15:02:16 -0400 Subject: [PATCH 46/58] not much new --- test_minimization_approach.ipynb | 43 ++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 5407c3ea..ea561155 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -767,6 +767,49 @@ "source": [ "## Plotting" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9602953a", + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "Synthetic dataset\n", + "'''\n", + "\n", + "# Define values\n", + "strategies = ['l0_sigmoid', 'l0_log', 'l0_exp', 'l1']\n", + "parameters = [1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001]\n", + "\n", + "# Synthetic values\n", + "base_size = 41310\n", + "min_size = 20000\n", + "base_loss = 0.0069\n", + "max_loss = 40.0\n", + "\n", + "# Construct rows\n", + "rows = [{'strategy': 'none', 'parameter': 'none', 'dataset_size': base_size, 'total_loss': base_loss}]\n", + "\n", + "for strategy in strategies:\n", + " for i, param in enumerate(parameters):\n", + " # Gradually decrease size and increase loss\n", + " size = int(base_size - (base_size - min_size) * (i / (len(parameters) - 1)))\n", + " loss = round(base_loss + (max_loss - base_loss) * (i / (len(parameters) - 1)), 4)\n", + " rows.append({\n", + " 'strategy': strategy,\n", + " 'parameter': param,\n", + " 'dataset_size': size,\n", + " 'total_loss': loss\n", + " })\n", + "\n", + "# Create DataFrame\n", + "reg_results_df = pd.DataFrame(rows)\n", + "\n", + "# Display\n", + "print(reg_results_df)" + ] } ], "metadata": { From 226b2d91d725f881887c67b55f0bba7f67ec4ada Mon Sep 17 00:00:00 2001 From: eccuraa Date: Tue, 15 Jul 2025 15:06:17 -0400 Subject: [PATCH 47/58] synthetic dataset --- policyengine_us_data/datasets/cps/enhanced_cps.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 851ea464..3da4f571 100644 --- 
a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -42,8 +42,7 @@ def reweight( epochs=500, log_path="calibration_log.csv", penalty_approach=None, - penalty_weight=None, - + penalty_weight=None, ): target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") @@ -62,7 +61,11 @@ def reweight( ) # TO DO: replace this with a call to the python reweight.py package. - def loss(weights, penalty_approach=penalty_approach, penalty_weight=penalty_weight): + def loss( + weights, + penalty_approach=penalty_approach, + penalty_weight=penalty_weight, + ): # Check for Nans in either the weights or the loss matrix if torch.isnan(weights).any(): raise ValueError("Weights contain NaNs") @@ -86,7 +89,6 @@ def loss(weights, penalty_approach=penalty_approach, penalty_weight=penalty_weig epsilon = 1e-3 # Threshold for "near zero" - # Option 1: Sigmoid approximation if penalty_approach == "l0_sigmoid": smoothed_l0 = torch.sigmoid( @@ -103,14 +105,11 @@ def loss(weights, penalty_approach=penalty_approach, penalty_weight=penalty_weig if penalty_approach == "l0_exp": smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() - if penalty_approach == "l1": l1 = torch.mean(weights) return rel_error_normalized.mean() + penalty_weight * l1 - return ( - rel_error_normalized.mean() + penalty_weight * smoothed_l0 - ) + return rel_error_normalized.mean() + penalty_weight * smoothed_l0 else: return rel_error_normalized.mean() From 6a8160b93cc7b7cc4d794df0c2451ddf8465aa63 Mon Sep 17 00:00:00 2001 From: eccuraa Date: Tue, 15 Jul 2025 15:06:56 -0400 Subject: [PATCH 48/58] committing before changing file --- policyengine_us_data/utils/minimise.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index b3e0ed1a..fc86b14d 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -192,7 +192,7 @@ def get_loss_from_mask( included_weights, included_estimate_matrix, targets, - epochs=250, + epochs=10, ) # Put calibrated weights back into full array @@ -431,7 +431,7 @@ def minimise_dataset( initial_weights, loss_matrix_clean.values, # Convert to numpy array targets_clean, - epochs=250, # Reduced epochs for faster processing + epochs=10, # Reduced epochs for faster processing ) sim.set_input("household_weight", 2024, calibrated_weights) print("Final calibration completed successfully") From 842dfa6d3c34d35cccfbe62fd6141d459e217df7 Mon Sep 17 00:00:00 2001 From: eccuraa Date: Tue, 15 Jul 2025 15:10:59 -0400 Subject: [PATCH 49/58] Merge minimize.py from maria/ecps_minimization branch --- policyengine_us_data/utils/minimize.py | 444 +++++++++++++++++++++++++ 1 file changed, 444 insertions(+) create mode 100644 policyengine_us_data/utils/minimize.py diff --git a/policyengine_us_data/utils/minimize.py b/policyengine_us_data/utils/minimize.py new file mode 100644 index 00000000..ce2c6fdf --- /dev/null +++ b/policyengine_us_data/utils/minimize.py @@ -0,0 +1,444 @@ +from policyengine_us_data.utils.loss import build_loss_matrix +from policyengine_core.data import Dataset +from policyengine_us import Microsimulation +import numpy as np +import pandas as pd +import h5py +from policyengine_us_data.storage import STORAGE_FOLDER +from typing import Optional, Callable + +bad_targets = [ + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross 
income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", +] + + +def create_calibration_log_file(file_path, epoch=0): + dataset = Dataset.from_file(file_path) + sim = Microsimulation(dataset=dataset) + + loss_matrix, targets = build_loss_matrix(dataset, 2024) + + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_clean = targets[keep_idx] + + assert loss_matrix_clean.shape[1] == targets_clean.size + + estimates = ( + sim.calculate("household_weight", 2024).values @ loss_matrix_clean + ) + target_names = loss_matrix_clean.columns + + # Calculate and print some key metrics + errors = estimates - targets_clean + rel_errors = errors / targets_clean + + df = pd.DataFrame( + { + "target_name": target_names, + "estimate": estimates, + "target": targets_clean, + } + ) + df["epoch"] = epoch + df["error"] = df["estimate"] - df["target"] + df["rel_error"] = df["error"] / df["target"] + df["abs_error"] = df["error"].abs() + df["rel_abs_error"] = ( + df["abs_error"] / df["target"].abs() + if df["target"].abs().sum() > 0 + else np.nan + ) + df["loss"] = (df["rel_error"] ** 2).mean() + + df.to_csv( + str(file_path).replace(".h5", "_calibration_log.csv"), index=False + ) + + +def losses_for_candidates( + base_weights: np.ndarray, + idxs: np.ndarray, + est_mat: np.ndarray, + targets: np.ndarray, + norm: np.ndarray, + chunk_size: Optional[int] = 25_000, +) -> np.ndarray: + """ + Return the loss value *for each* candidate deletion in `idxs` + in one matrix multiplication. + + Parameters + ---------- + base_weights : (n,) original weight vector + idxs : (k,) candidate row indices to zero-out + est_mat : (n, m) estimate matrix + targets : (m,) calibration targets + norm : (m,) normalisation factors + chunk_size : max number of candidates to process at once + + Returns + ------- + losses : (k,) loss if row i were removed (and weights rescaled) + """ + W = base_weights + total = W.sum() + k = len(idxs) + losses = np.empty(k, dtype=float) + + # Work through the candidate list in blocks + for start in range(0, k, chunk_size): + stop = min(start + chunk_size, k) + part = idxs[start:stop] # (p,) where p ≤ chunk_size + p = len(part) + + # Build the delta matrix only for this chunk + delta = np.zeros((p, len(W))) + delta[np.arange(p), part] = -W[part] + + keep_total = total + delta.sum(axis=1) # (p,) + delta *= (total / keep_total)[:, None] + + # Matrix–matrix multiply → one matrix multiplication per chunk + ests = (W + delta) @ est_mat # (p, m) + rel_err = ((ests - targets) + 1) / (targets + 1) + losses[start:stop] = ((rel_err * norm) ** 2).mean(axis=1) + + return losses + + +def get_loss_from_mask( + weights, inclusion_mask, estimate_matrix, targets, normalisation_factor +): + """ + Calculate the loss based on the inclusion mask and the estimate matrix. 
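+
+    In outline: weights outside inclusion_mask are zeroed, the surviving
+    weights are rescaled so the total population weight is unchanged, the
+    retained households are re-calibrated with reweight(), and the loss is
+    the mean of the squared, normalisation-weighted (smoothed) relative
+    errors against targets.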
+ """ + # Step 1: Apply mask and rescale weights + masked_weights = weights.copy() + original_weight_total = masked_weights.sum() + if (~inclusion_mask).sum() > 0: + masked_weights[~inclusion_mask] = 0 + masked_weight_total = masked_weights.sum() + masked_weights[inclusion_mask] *= ( + original_weight_total / masked_weight_total + ) + + # Step 2: Re-calibrate the masked weights to hit targets + # Only calibrate the included households + included_weights = masked_weights[inclusion_mask] + included_estimate_matrix = estimate_matrix.iloc[ + inclusion_mask + ] # Keep as DataFrame + + # Call reweight function to calibrate the selected households + from policyengine_us_data.datasets.cps.enhanced_cps import reweight + + calibrated_weights_included = reweight( + included_weights, + included_estimate_matrix, + targets, + epochs=250, + ) + + # Put calibrated weights back into full array + calibrated_weights = np.zeros_like(masked_weights) + calibrated_weights[inclusion_mask] = calibrated_weights_included + + # Calculate estimates and loss from calibrated weights + estimates = calibrated_weights @ estimate_matrix + rel_error = ((estimates - targets) + 1) / (targets + 1) + loss = ((rel_error * normalisation_factor) ** 2).mean() + + return loss + + +def candidate_loss_contribution( + weights: np.ndarray, + estimate_matrix: np.ndarray, + targets: np.ndarray, + normalisation_factor: np.ndarray, + loss_rel_change_max: float, + count_iterations: int = 5, + view_fraction_per_iteration: float = 0.5, + fraction_remove_per_iteration: float = 0.05, +) -> np.ndarray: + """ + Minimization approach based on candidate loss contribution. + + This function iteratively removes households that contribute least to the loss, + maintaining the calibration quality within the specified tolerance. + + Parameters + ---------- + weights : (n,) household weights + estimate_matrix : (n, m) matrix mapping weights to estimates + targets : (m,) calibration targets + normalisation_factor : (m,) normalisation factors for different targets + loss_rel_change_max : maximum allowed relative change in loss + count_iterations : number of iterations to perform + view_fraction_per_iteration : fraction of households to evaluate each iteration + fraction_remove_per_iteration : fraction of households to remove each iteration + + Returns + ------- + inclusion_mask : (n,) boolean mask of households to keep + """ + from tqdm import tqdm + + full_mask = np.ones_like(weights, dtype=bool) + + for i in range(count_iterations): + inclusion_mask = full_mask.copy() + baseline_loss = get_loss_from_mask( + weights, + inclusion_mask, + estimate_matrix, + targets, + normalisation_factor, + ) + + # Sample only households that are currently included + indices = np.random.choice( + np.where(full_mask)[0], + size=int(full_mask.sum() * view_fraction_per_iteration), + replace=False, + ) + # 2. compute losses for the batch in one shot + candidate_losses = losses_for_candidates( + weights, indices, estimate_matrix, targets, normalisation_factor + ) + # 3. convert to relative change vs. 
baseline + household_loss_rel_changes = ( + candidate_losses - baseline_loss + ) / baseline_loss + + inclusion_mask = full_mask.copy() + household_loss_rel_changes = np.array(household_loss_rel_changes) + # Sort by the relative change in loss + sorted_indices = np.argsort(household_loss_rel_changes) + + # Remove the worst households + num_to_remove = int(len(weights) * fraction_remove_per_iteration) + worst_indices = indices[sorted_indices[:num_to_remove]] + inclusion_mask[worst_indices] = False + + # Calculate the new loss + new_loss = get_loss_from_mask( + weights, + inclusion_mask, + estimate_matrix, + targets, + normalisation_factor, + ) + rel_change = (new_loss - baseline_loss) / baseline_loss + + if rel_change > loss_rel_change_max: + print( + f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}, " + f"which is too high ({rel_change:.2%}). Stopping." + ) + break + + print( + f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}" + ) + print( + f"Removed {num_to_remove} households with worst relative loss changes." + ) + + # Update the full mask + full_mask &= inclusion_mask + + return full_mask + + +def random_sampling_minimization( + weights, + estimate_matrix, + targets, + normalisation_factor, + random=True, + target_fractions=[0.5, 0.6, 0.7, 0.8, 0.9], +): + """A simple random sampling approach""" + n = len(weights) + + household_weights_normalized = weights / weights.sum() + + final_mask = None + lowest_loss = float("inf") + for fraction in target_fractions: + target_size = int(n * fraction) + # Random sampling with multiple attempts + best_mask = None + best_loss = float("inf") + + for _ in range(3): # Try 3 random samples + mask = np.zeros(n, dtype=bool) + mask[ + np.random.choice( + n, + target_size, + p=household_weights_normalized if random else None, + replace=False, + ) + ] = True + + loss = get_loss_from_mask( + weights, mask, estimate_matrix, targets, normalisation_factor + ) + + if loss < best_loss: + best_loss = loss + best_mask = mask + + if lowest_loss > best_loss: + lowest_loss = best_loss + final_mask = best_mask + + return final_mask + + +def minimize_dataset( + dataset, + output_path: str, + minimization_function: Callable = candidate_loss_contribution, + loss_matrix: Optional[pd.DataFrame] = None, + targets: Optional[np.ndarray] = None, + **kwargs, +) -> None: + """ + Main function to minimize a dataset using a specified minimization approach. 
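+
+    In outline: the loss matrix is built, known-bad IRS targets are dropped,
+    minimization_function is run to obtain an inclusion mask, only those
+    households are kept and re-calibrated with reweight(), and the result is
+    written to HDF5 alongside a calibration log CSV.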
+ + Parameters + ---------- + dataset : path to the dataset file or Dataset object + output_path : path where the minimized dataset will be saved + loss_rel_change_max : maximum allowed relative change in loss + minimization_function : function that implements the minimization logic + **kwargs : additional arguments to pass to the minimization function + """ + # Handle both dataset class and file path + if hasattr(dataset, "file_path"): + dataset_path = str(dataset.file_path) + else: + dataset_path = str(dataset) + + create_calibration_log_file(dataset_path) + + dataset = Dataset.from_file(dataset_path) + if loss_matrix is None or targets is None: + loss_matrix, targets = build_loss_matrix(dataset, 2024) + + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_clean = targets[keep_idx] + assert loss_matrix_clean.shape[1] == targets_clean.size + else: + loss_matrix_clean = loss_matrix + targets_clean = targets + + sim = Microsimulation(dataset=dataset) + + weights = sim.calculate("household_weight", 2024).values + is_national = loss_matrix_clean.columns.str.startswith("nation/") + nation_normalisation_factor = is_national * (1 / is_national.sum()) + state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) + normalisation_factor = np.where( + is_national, nation_normalisation_factor, state_normalisation_factor + ) + + # Call the minimization function + inclusion_mask = minimization_function( + weights=weights, + estimate_matrix=loss_matrix_clean, + targets=targets_clean, + normalisation_factor=normalisation_factor, + **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. 
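+        # e.g. (per the functions defined above):
+        #   candidate_loss_contribution -> loss_rel_change_max,
+        #     count_iterations, view_fraction_per_iteration,
+        #     fraction_remove_per_iteration
+        #   random_sampling_minimization -> target_fractions, random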
+ ) + + # Extract household IDs for remaining households + household_ids = sim.calculate("household_id", 2024).values + remaining_households = household_ids[inclusion_mask] + + # Create a smaller dataset with only the remaining households + df = sim.to_input_dataframe() + smaller_df = df[df["household_id__2024"].isin(remaining_households)] + + weight_rel_change = ( + smaller_df["household_weight__2024"].sum() + / df["household_weight__2024"].sum() + ) + print(f"Weight relative change: {weight_rel_change:.2%}") + + # Create new simulation with smaller dataset + sim = Microsimulation(dataset=smaller_df) + + # Rescale weights to maintain total + initial_weights = ( + sim.calculate("household_weight", 2024).values / weight_rel_change + ) + + # Re-calibrate the final selected households to hit targets + print("Re-calibrating final selected households...") + + # Build loss matrix for the smaller dataset + smaller_loss_matrix, smaller_targets = build_loss_matrix(sim.dataset, 2024) + + # Apply same filtering as before + bad_mask = smaller_loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + smaller_loss_matrix_clean = smaller_loss_matrix.iloc[:, keep_idx] + smaller_targets_clean = smaller_targets[keep_idx] + + from policyengine_us_data.datasets.cps.enhanced_cps import reweight + + calibrated_weights = reweight( + initial_weights, + smaller_loss_matrix_clean, # Now matches the smaller dataset size + smaller_targets_clean, + epochs=250, # Reduced epochs for faster processing + ) + sim.set_input("household_weight", 2024, calibrated_weights) + print("Final calibration completed successfully") + # Prepare data for saving + data = {} + for variable in sim.input_variables: + data[variable] = {2024: sim.calculate(variable, 2024).values} + if data[variable][2024].dtype == "object": + data[variable][2024] = data[variable][2024].astype("S") + + # Save to HDF5 file + with h5py.File(output_path, "w") as f: + for variable, values in data.items(): + for year, value in values.items(): + f.create_dataset(f"{variable}/{year}", data=value) + + print(f"Saved minimised dataset to {output_path}") + create_calibration_log_file(output_path, epoch=250) + + +if __name__ == "__main__": + # Example usage + files = [ + STORAGE_FOLDER / "enhanced_cps_2024.h5", + ] + + for file in files: + output_path = file.with_name(file.stem + "_minimised.h5") + minimize_dataset( + file, + output_path, + ) From f815c7eb523991be87627f82f2514f1635292f2e Mon Sep 17 00:00:00 2001 From: eccuraa Date: Tue, 15 Jul 2025 15:12:46 -0400 Subject: [PATCH 50/58] renaming to american naming (maria started it haha) --- policyengine_us_data/utils/minimise.py | 466 ------------------------- 1 file changed, 466 deletions(-) delete mode 100644 policyengine_us_data/utils/minimise.py diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py deleted file mode 100644 index fc86b14d..00000000 --- a/policyengine_us_data/utils/minimise.py +++ /dev/null @@ -1,466 +0,0 @@ -from policyengine_us_data.utils.loss import build_loss_matrix -from policyengine_core.data import Dataset -from policyengine_us import Microsimulation -import numpy as np -import pandas as pd -import h5py -from policyengine_us_data.storage import STORAGE_FOLDER -from typing import Optional, Callable -from policyengine_us_data.datasets.cps.enhanced_cps import reweight - -bad_targets = [ - "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", - "nation/irs/adjusted gross 
income/total/AGI in 15k-20k/taxable/Head of Household", - "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", - "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", - "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", -] - - -def create_calibration_log_file(file_path, epoch=0): - print(f"=== CALIBRATION LOG DEBUG ===") - print(f"File path: {file_path}") - print(f"Epoch: {epoch}") - - dataset = Dataset.from_file(file_path) - sim = Microsimulation(dataset=dataset) - - # Debug: Print dataset info - household_weights = sim.calculate("household_weight", 2024) - print(f"Number of households: {len(household_weights)}") - print(f"Total weight: {household_weights.sum():.2f}") - print( - f"Weight range: {household_weights.min():.2f} to {household_weights.max():.2f}" - ) - - loss_matrix, targets = build_loss_matrix(dataset, 2024) - print(f"Loss matrix shape: {loss_matrix.shape}") - print(f"Number of targets: {len(targets)}") - - bad_mask = loss_matrix.columns.isin(bad_targets) - keep_mask_bool = ~bad_mask - keep_idx = np.where(keep_mask_bool)[0] - loss_matrix_clean = loss_matrix.iloc[:, keep_idx] - targets_clean = targets[keep_idx] - - print(f"After filtering bad targets:") - print(f"Loss matrix clean shape: {loss_matrix_clean.shape}") - print(f"Number of clean targets: {len(targets_clean)}") - - assert loss_matrix_clean.shape[1] == targets_clean.size - - estimates = ( - sim.calculate("household_weight", 2024).values @ loss_matrix_clean - ) - target_names = loss_matrix_clean.columns - - # Debug: Print estimate statistics - print(f"Estimates shape: {estimates.shape}") - print(f"Estimates sum: {estimates.sum():.2f}") - print(f"First 3 estimates: {estimates[:3]}") - print(f"First 3 targets: {targets_clean[:3]}") - - # Calculate and print some key metrics - errors = estimates - targets_clean - rel_errors = errors / targets_clean - print(f"Mean absolute error: {np.abs(errors).mean():.2f}") - print(f"Mean relative error: {np.abs(rel_errors).mean():.4f}") - print(f"=== END DEBUG ===\n") - - df = pd.DataFrame( - { - "target_name": target_names, - "estimate": estimates, - "target": targets_clean, - } - ) - df["epoch"] = epoch - df["error"] = df["estimate"] - df["target"] - df["rel_error"] = df["error"] / df["target"] - df["abs_error"] = df["error"].abs() - df["rel_abs_error"] = ( - df["abs_error"] / df["target"].abs() - if df["target"].abs().sum() > 0 - else np.nan - ) - df["loss"] = (df["rel_error"] ** 2).mean() - - df.to_csv( - str(file_path).replace(".h5", "_calibration_log.csv"), index=False - ) - - -def losses_for_candidates( - base_weights: np.ndarray, - idxs: np.ndarray, - est_mat: np.ndarray, - targets: np.ndarray, - norm: np.ndarray, - chunk_size: Optional[int] = 25_000, -) -> np.ndarray: - """ - Return the loss value *for each* candidate deletion in `idxs` - in one matrix multiplication. 
- - Parameters - ---------- - base_weights : (n,) original weight vector - idxs : (k,) candidate row indices to zero-out - est_mat : (n, m) estimate matrix - targets : (m,) calibration targets - norm : (m,) normalisation factors - chunk_size : max number of candidates to process at once - - Returns - ------- - losses : (k,) loss if row i were removed (and weights rescaled) - """ - W = base_weights - total = W.sum() - k = len(idxs) - losses = np.empty(k, dtype=float) - - # Work through the candidate list in blocks - for start in range(0, k, chunk_size): - stop = min(start + chunk_size, k) - part = idxs[start:stop] # (p,) where p ≤ chunk_size - p = len(part) - - # Build the delta matrix only for this chunk - delta = np.zeros((p, len(W))) - delta[np.arange(p), part] = -W[part] - - keep_total = total + delta.sum(axis=1) # (p,) - delta *= (total / keep_total)[:, None] - - # Matrix–matrix multiply → one matrix multiplication per chunk - ests = (W + delta) @ est_mat # (p, m) - rel_err = ((ests - targets) + 1) / (targets + 1) - losses[start:stop] = ((rel_err * norm) ** 2).mean(axis=1) - - return losses - - -def minimise_dataset( - dataset, output_path: str, loss_rel_change_max: float -) -> None: - dataset = str(dataset) - create_calibration_log_file(dataset) - - dataset = Dataset.from_file(dataset) - loss_matrix = build_loss_matrix(dataset, 2024) - - sim = Microsimulation(dataset=dataset) - - weights = sim.calculate("household_weight", 2024).values - estimate_matrix, targets = loss_matrix - is_national = estimate_matrix.columns.str.startswith("nation/") - nation_normalisation_factor = is_national * (1 / is_national.sum()) - state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) - normalisation_factor = np.where( - is_national, nation_normalisation_factor, state_normalisation_factor - ) - weights @ estimate_matrix - - -def get_loss_from_mask( - weights, inclusion_mask, estimate_matrix, targets, normalisation_factor -): - """ - Calculate the loss based on the inclusion mask and the estimate matrix. 
- """ - # Step 1: Apply mask and rescale weights - masked_weights = weights.copy() - original_weight_total = masked_weights.sum() - if (~inclusion_mask).sum() > 0: - masked_weights[~inclusion_mask] = 0 - masked_weight_total = masked_weights.sum() - masked_weights[inclusion_mask] *= ( - original_weight_total / masked_weight_total - ) - - # Step 2: Re-calibrate the masked weights to hit targets - # Only calibrate the included households - included_weights = masked_weights[inclusion_mask] - included_estimate_matrix = estimate_matrix[inclusion_mask] - - # Call reweight function to calibrate the selected households - calibrated_weights_included = reweight( - included_weights, - included_estimate_matrix, - targets, - epochs=10, - ) - - # Put calibrated weights back into full array - calibrated_weights = np.zeros_like(masked_weights) - calibrated_weights[inclusion_mask] = calibrated_weights_included - - # Calculate estimates and loss from calibrated weights - estimates = calibrated_weights @ estimate_matrix - rel_error = ((estimates - targets) + 1) / (targets + 1) - loss = ((rel_error * normalisation_factor) ** 2).mean() - - return loss - - -def candidate_loss_contribution( - weights: np.ndarray, - estimate_matrix: np.ndarray, - targets: np.ndarray, - normalisation_factor: np.ndarray, - loss_rel_change_max: float, - count_iterations: int = 5, - view_fraction_per_iteration: float = 0.3, - fraction_remove_per_iteration: float = 0.1, -) -> np.ndarray: - """ - Minimization approach based on candidate loss contribution. - - This function iteratively removes households that contribute least to the loss, - maintaining the calibration quality within the specified tolerance. - - Parameters - ---------- - weights : (n,) household weights - estimate_matrix : (n, m) matrix mapping weights to estimates - targets : (m,) calibration targets - normalisation_factor : (m,) normalisation factors for different targets - loss_rel_change_max : maximum allowed relative change in loss - count_iterations : number of iterations to perform - view_fraction_per_iteration : fraction of households to evaluate each iteration - fraction_remove_per_iteration : fraction of households to remove each iteration - - Returns - ------- - inclusion_mask : (n,) boolean mask of households to keep - """ - from tqdm import tqdm - - full_mask = np.ones_like(weights, dtype=bool) - - for i in range(count_iterations): - inclusion_mask = full_mask.copy() - baseline_loss = get_loss_from_mask( - weights, - inclusion_mask, - estimate_matrix, - targets, - normalisation_factor, - ) - - # Sample only households that are currently included - indices = np.random.choice( - np.where(full_mask)[0], - size=int(full_mask.sum() * view_fraction_per_iteration), - replace=False, - ) - # 2. compute losses for the batch in one shot - candidate_losses = losses_for_candidates( - weights, indices, estimate_matrix, targets, normalisation_factor - ) - # 3. convert to relative change vs. 
baseline - household_loss_rel_changes = ( - candidate_losses - baseline_loss - ) / baseline_loss - - inclusion_mask = full_mask.copy() - household_loss_rel_changes = np.array(household_loss_rel_changes) - # Sort by the relative change in loss - sorted_indices = np.argsort(household_loss_rel_changes) - - # Remove the worst households - num_to_remove = int(len(weights) * fraction_remove_per_iteration) - worst_indices = indices[sorted_indices[:num_to_remove]] - inclusion_mask[worst_indices] = False - - # Calculate the new loss - new_loss = get_loss_from_mask( - weights, - inclusion_mask, - estimate_matrix, - targets, - normalisation_factor, - ) - rel_change = (new_loss - baseline_loss) / baseline_loss - - if rel_change > loss_rel_change_max: - print( - f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}, " - f"which is too high ({rel_change:.2%}). Stopping." - ) - break - - print( - f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}" - ) - print( - f"Removed {num_to_remove} households with worst relative loss changes." - ) - - # Update the full mask - full_mask &= inclusion_mask - - return full_mask - - -def random_sampling_minimization( - weights, - estimate_matrix, - targets, - normalisation_factor, - random=True, - target_fractions=[0.5, 0.6, 0.7, 0.8, 0.9], -): - """A simple random sampling approach""" - n = len(weights) - - household_weights_normalized = weights / weights.sum() - - final_mask = None - lowest_loss = float("inf") - for fraction in target_fractions: - target_size = int(n * fraction) - # Random sampling with multiple attempts - best_mask = None - best_loss = float("inf") - - for _ in range(3): # Try 3 random samples - mask = np.zeros(n, dtype=bool) - mask[ - np.random.choice( - n, - target_size, - p=household_weights_normalized if random else None, - replace=False, - ) - ] = True - - loss = get_loss_from_mask( - weights, mask, estimate_matrix, targets, normalisation_factor - ) - - if loss < best_loss: - best_loss = loss - best_mask = mask - - if lowest_loss > best_loss: - lowest_loss = best_loss - final_mask = best_mask - - return final_mask - - -def minimise_dataset( - dataset, - output_path: str, - minimization_function: Callable = candidate_loss_contribution, - **kwargs, -) -> None: - """ - Main function to minimize a dataset using a specified minimization approach. 
- - Parameters - ---------- - dataset : path to the dataset file or Dataset object - output_path : path where the minimized dataset will be saved - loss_rel_change_max : maximum allowed relative change in loss - minimization_function : function that implements the minimization logic - **kwargs : additional arguments to pass to the minimization function - """ - dataset = str(dataset) - create_calibration_log_file(dataset) - - dataset = Dataset.from_file(dataset) - loss_matrix, targets = build_loss_matrix(dataset, 2024) - - bad_mask = loss_matrix.columns.isin(bad_targets) - keep_mask_bool = ~bad_mask - keep_idx = np.where(keep_mask_bool)[0] - loss_matrix_clean = loss_matrix.iloc[:, keep_idx] - targets_clean = targets[keep_idx] - assert loss_matrix_clean.shape[1] == targets_clean.size - - sim = Microsimulation(dataset=dataset) - - weights = sim.calculate("household_weight", 2024).values - is_national = loss_matrix_clean.columns.str.startswith("nation/") - nation_normalisation_factor = is_national * (1 / is_national.sum()) - state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) - normalisation_factor = np.where( - is_national, nation_normalisation_factor, state_normalisation_factor - ) - - # Call the minimization function - inclusion_mask = minimization_function( - weights=weights, - estimate_matrix=loss_matrix_clean, - targets=targets_clean, - normalisation_factor=normalisation_factor, - **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. - ) - - # Extract household IDs for remaining households - household_ids = sim.calculate("household_id", 2024).values - remaining_households = household_ids[inclusion_mask] - - # Create a smaller dataset with only the remaining households - df = sim.to_input_dataframe() - smaller_df = df[df["household_id__2024"].isin(remaining_households)] - - weight_rel_change = ( - smaller_df["household_weight__2024"].sum() - / df["household_weight__2024"].sum() - ) - print(f"Weight relative change: {weight_rel_change:.2%}") - - # Create new simulation with smaller dataset - sim = Microsimulation(dataset=smaller_df) - - # Rescale weights to maintain total - initial_weights = ( - sim.calculate("household_weight", 2024).values / weight_rel_change - ) - - # Re-calibrate the final selected households to hit targets - print("Re-calibrating final selected households...") - calibrated_weights = reweight( - initial_weights, - loss_matrix_clean.values, # Convert to numpy array - targets_clean, - epochs=10, # Reduced epochs for faster processing - ) - sim.set_input("household_weight", 2024, calibrated_weights) - print("Final calibration completed successfully") - # Prepare data for saving - data = {} - for variable in sim.input_variables: - data[variable] = {2024: sim.calculate(variable, 2024).values} - if data[variable][2024].dtype == "object": - data[variable][2024] = data[variable][2024].astype("S") - - # Save to HDF5 file - with h5py.File(output_path, "w") as f: - for variable, values in data.items(): - for year, value in values.items(): - f.create_dataset(f"{variable}/{year}", data=value) - - print(f"Saved minimised dataset to {output_path}") - create_calibration_log_file(output_path, epoch=500) - - -if __name__ == "__main__": - # Example usage - files = [ - STORAGE_FOLDER / "enhanced_cps_2024.h5", - ] - - for file in files: - output_path = file.with_name(file.stem + "_minimised.h5") - minimise_dataset( - file, - output_path, - ) From 096fb0f2c98b5387da0fc7827a42d75d288fbae9 Mon Sep 17 
00:00:00 2001 From: eccuraa Date: Tue, 15 Jul 2025 15:15:49 -0400 Subject: [PATCH 51/58] more american spelling for debugging --- test_minimization_approach.ipynb | 73 +++----------------------------- 1 file changed, 7 insertions(+), 66 deletions(-) diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index ea561155..ab96f82f 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,12 +2,12 @@ "cells": [ { "cell_type": "code", - "execution_count": 66, + "execution_count": 2, "id": "d6dc9cca", "metadata": {}, "outputs": [], "source": [ - "from policyengine_us_data.utils.minimise import minimise_dataset, random_sampling_minimization, candidate_loss_contribution\n", + "from policyengine_us_data.utils.minimize import minimize_dataset, random_sampling_minimization, candidate_loss_contribution\n", "from policyengine_us_data.storage import STORAGE_FOLDER\n", "from policyengine_us import Microsimulation\n", "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", @@ -211,74 +211,15 @@ "id": "69ff392d", "metadata": {}, "source": [ - "# Minimise.py approaches" + "# Minimize.py approaches" ] }, { "cell_type": "code", - "execution_count": 67, + "execution_count": null, "id": "aeab67b3", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "=== CALIBRATION LOG DEBUG ===\n", - "File path: /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/enhanced_cps_2024.h5\n", - "Epoch: 0\n", - "Number of households: 41310\n", - "Total weight: 12764381616743.21\n", - "Weight range: 0.54 to 1303728.75\n", - "Loss matrix shape: (41310, 2813)\n", - "Number of targets: 2813\n", - "After filtering bad targets:\n", - "Loss matrix clean shape: (41310, 2805)\n", - "Number of clean targets: 2805\n", - "Estimates shape: (2805,)\n", - "Estimates sum: 324584770671300.88\n", - "First 3 estimates: nation/irs/adjusted gross income/total/AGI in -inf-inf/taxable/All 1.498784e+13\n", - "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/All 1.609638e+10\n", - "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/All 6.707770e+10\n", - "dtype: float64\n", - "First 3 targets: [1.62972204e+13 1.68634879e+10 6.76819729e+10]\n", - "Mean absolute error: 17235490830.73\n", - "Mean relative error: 0.0997\n", - "=== END DEBUG ===\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250/250 [03:38<00:00, 1.14it/s, loss=3.34e-5, loss_rel_change=-0.357]\n", - "100%|██████████| 250/250 [02:39<00:00, 1.57it/s, loss=3.52e-5, loss_rel_change=-0.334]\n", - "100%|██████████| 250/250 [01:32<00:00, 2.70it/s, loss=3.39e-5, loss_rel_change=-0.34] \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Weight relative change: 99.95%\n", - "Re-calibrating final selected households...\n" - ] - }, - { - "ename": "AttributeError", - "evalue": "'numpy.ndarray' object has no attribute 'columns'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[67], line 31\u001b[0m\n\u001b[1;32m 29\u001b[0m output_path \u001b[38;5;241m=\u001b[39m STORAGE_FOLDER \u001b[38;5;241m/\u001b[39m approach \u001b[38;5;241m/\u001b[39m 
\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvalue\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_enhanced_cps_2024_minimised.h5\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 30\u001b[0m output_path\u001b[38;5;241m.\u001b[39mparent\u001b[38;5;241m.\u001b[39mmkdir(parents\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m---> 31\u001b[0m \u001b[43mminimise_dataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 32\u001b[0m \u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 33\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 34\u001b[0m \u001b[43m \u001b[49m\u001b[43mminimization_function\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mminimization_function\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 35\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget_fractions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 36\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m params \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mloss_rel_change_max\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 38\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m files:\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/utils/minimise.py:430\u001b[0m, in \u001b[0;36mminimise_dataset\u001b[0;34m(dataset, output_path, minimization_function, **kwargs)\u001b[0m\n\u001b[1;32m 428\u001b[0m \u001b[38;5;66;03m# Re-calibrate the final selected households to hit targets\u001b[39;00m\n\u001b[1;32m 429\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRe-calibrating final selected households...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 430\u001b[0m calibrated_weights \u001b[38;5;241m=\u001b[39m \u001b[43mreweight\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 431\u001b[0m \u001b[43m \u001b[49m\u001b[43minitial_weights\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 432\u001b[0m \u001b[43m \u001b[49m\u001b[43mloss_matrix_clean\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Convert to numpy array\u001b[39;49;00m\n\u001b[1;32m 433\u001b[0m \u001b[43m \u001b[49m\u001b[43mtargets_clean\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 434\u001b[0m \u001b[43m \u001b[49m\u001b[43mepochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Reduced epochs for faster processing\u001b[39;49;00m\n\u001b[1;32m 435\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 436\u001b[0m sim\u001b[38;5;241m.\u001b[39mset_input(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhousehold_weight\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;241m2024\u001b[39m, calibrated_weights)\n\u001b[1;32m 437\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFinal calibration completed successfully\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/datasets/cps/enhanced_cps.py:47\u001b[0m, in \u001b[0;36mreweight\u001b[0;34m(original_weights, loss_matrix, 
targets_array, dropout_rate, epochs, log_path, penalty_approach, penalty_weight)\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mreweight\u001b[39m(\n\u001b[1;32m 38\u001b[0m original_weights,\n\u001b[1;32m 39\u001b[0m loss_matrix,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 45\u001b[0m penalty_weight\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 46\u001b[0m ):\n\u001b[0;32m---> 47\u001b[0m target_names \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39marray(\u001b[43mloss_matrix\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m)\n\u001b[1;32m 48\u001b[0m is_national \u001b[38;5;241m=\u001b[39m loss_matrix\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mstr\u001b[38;5;241m.\u001b[39mstartswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnation/\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 49\u001b[0m loss_matrix \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mtensor(loss_matrix\u001b[38;5;241m.\u001b[39mvalues, dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat32)\n", - "\u001b[0;31mAttributeError\u001b[0m: 'numpy.ndarray' object has no attribute 'columns'" - ] - } - ], + "outputs": [], "source": [ "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", "\n", @@ -310,7 +251,7 @@ " for file in files:\n", " output_path = STORAGE_FOLDER / approach / f\"{value}_enhanced_cps_2024_minimised.h5\"\n", " output_path.parent.mkdir(parents=True, exist_ok=True)\n", - " minimise_dataset(\n", + " minimize_dataset(\n", " file,\n", " output_path,\n", " minimization_function=minimization_function, \n", @@ -320,7 +261,7 @@ " for file in files:\n", " output_path = STORAGE_FOLDER / approach / f\"{value}_enhanced_cps_2024_minimised.h5\"\n", " output_path.parent.mkdir(parents=True, exist_ok=True)\n", - " minimise_dataset(\n", + " minimize_dataset(\n", " file,\n", " output_path,\n", " minimization_function=minimization_function, \n", From 41980ac5947497f06942de6c9834bcaca26e7397 Mon Sep 17 00:00:00 2001 From: eccuraa Date: Tue, 15 Jul 2025 15:42:17 -0400 Subject: [PATCH 52/58] initial visualization with synthetic data --- test_minimization_approach.ipynb | 1373 +++++++++++++++++++++++++++++- 1 file changed, 1363 insertions(+), 10 deletions(-) diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index ab96f82f..9952ae4e 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,10 +2,19 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "d6dc9cca", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], "source": [ "from policyengine_us_data.utils.minimize import minimize_dataset, random_sampling_minimization, candidate_loss_contribution\n", "from policyengine_us_data.storage import STORAGE_FOLDER\n", @@ -16,6 +25,7 @@ "import os\n", "import h5py\n", "import pandas as pd\n", + "import plotly.express as px\n", "\n", "\n", "bad_targets = [\n", @@ -216,10 +226,68 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "aeab67b3", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[6], line 31\u001b[0m\n\u001b[1;32m 29\u001b[0m output_path \u001b[38;5;241m=\u001b[39m STORAGE_FOLDER \u001b[38;5;241m/\u001b[39m approach \u001b[38;5;241m/\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvalue\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_enhanced_cps_2024_minimised.h5\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 30\u001b[0m output_path\u001b[38;5;241m.\u001b[39mparent\u001b[38;5;241m.\u001b[39mmkdir(parents\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m---> 31\u001b[0m \u001b[43mminimize_dataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 32\u001b[0m \u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 33\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 34\u001b[0m \u001b[43m \u001b[49m\u001b[43mminimization_function\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mminimization_function\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 35\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget_fractions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 36\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m params \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mloss_rel_change_max\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 38\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m files:\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/utils/minimize.py:336\u001b[0m, in \u001b[0;36mminimize_dataset\u001b[0;34m(dataset, output_path, minimization_function, loss_matrix, targets, **kwargs)\u001b[0m\n\u001b[1;32m 333\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 334\u001b[0m dataset_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(dataset)\n\u001b[0;32m--> 336\u001b[0m \u001b[43mcreate_calibration_log_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 338\u001b[0m dataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_file(dataset_path)\n\u001b[1;32m 339\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m loss_matrix \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m targets \u001b[38;5;129;01mis\u001b[39;00m 
\u001b[38;5;28;01mNone\u001b[39;00m:\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/utils/minimize.py:26\u001b[0m, in \u001b[0;36mcreate_calibration_log_file\u001b[0;34m(file_path, epoch)\u001b[0m\n\u001b[1;32m 23\u001b[0m dataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_file(file_path)\n\u001b[1;32m 24\u001b[0m sim \u001b[38;5;241m=\u001b[39m Microsimulation(dataset\u001b[38;5;241m=\u001b[39mdataset)\n\u001b[0;32m---> 26\u001b[0m loss_matrix, targets \u001b[38;5;241m=\u001b[39m \u001b[43mbuild_loss_matrix\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m2024\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 28\u001b[0m bad_mask \u001b[38;5;241m=\u001b[39m loss_matrix\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39misin(bad_targets)\n\u001b[1;32m 29\u001b[0m keep_mask_bool \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m~\u001b[39mbad_mask\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/utils/loss.py:419\u001b[0m, in \u001b[0;36mbuild_loss_matrix\u001b[0;34m(dataset, time_period)\u001b[0m\n\u001b[1;32m 415\u001b[0m targets_array\u001b[38;5;241m.\u001b[39mappend(NET_WORTH_2024)\n\u001b[1;32m 417\u001b[0m \u001b[38;5;66;03m# SALT tax expenditure targeting\u001b[39;00m\n\u001b[0;32m--> 419\u001b[0m \u001b[43m_add_tax_expenditure_targets\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 420\u001b[0m \u001b[43m \u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtime_period\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msim\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mloss_matrix\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtargets_array\u001b[49m\n\u001b[1;32m 421\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 423\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28many\u001b[39m(loss_matrix\u001b[38;5;241m.\u001b[39misna()\u001b[38;5;241m.\u001b[39msum() \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m):\n\u001b[1;32m 424\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSome targets are missing from the loss matrix\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/utils/loss.py:637\u001b[0m, in \u001b[0;36m_add_tax_expenditure_targets\u001b[0;34m(dataset, time_period, baseline_simulation, loss_matrix, targets_array)\u001b[0m\n\u001b[1;32m 634\u001b[0m simulation\u001b[38;5;241m.\u001b[39mdefault_calculation_period \u001b[38;5;241m=\u001b[39m time_period\n\u001b[1;32m 636\u001b[0m \u001b[38;5;66;03m# Calculate the baseline and reform income tax values.\u001b[39;00m\n\u001b[0;32m--> 637\u001b[0m income_tax_r \u001b[38;5;241m=\u001b[39m \u001b[43msimulation\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 638\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mincome_tax\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mhousehold\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\n\u001b[1;32m 639\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mvalues\n\u001b[1;32m 641\u001b[0m 
\u001b[38;5;66;03m# Compute the tax expenditure (TE) values.\u001b[39;00m\n\u001b[1;32m 642\u001b[0m te_values \u001b[38;5;241m=\u001b[39m income_tax_r \u001b[38;5;241m-\u001b[39m income_tax_b\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/microsimulation.py:54\u001b[0m, in \u001b[0;36mMicrosimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, use_weights, decode_enums)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m period \u001b[38;5;241m=\u001b[39m get_period(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period)\n\u001b[0;32m---> 54\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_enums\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m use_weights:\n\u001b[1;32m 56\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m values\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:485\u001b[0m, in \u001b[0;36mSimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, decode_enums)\u001b[0m\n\u001b[1;32m 482\u001b[0m np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mseed(\u001b[38;5;28mhash\u001b[39m(variable_name \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(period)) \u001b[38;5;241m%\u001b[39m \u001b[38;5;241m1000000\u001b[39m)\n\u001b[1;32m 484\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 485\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_calculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 486\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, EnumArray) \u001b[38;5;129;01mand\u001b[39;00m decode_enums:\n\u001b[1;32m 487\u001b[0m result \u001b[38;5;241m=\u001b[39m result\u001b[38;5;241m.\u001b[39mdecode_to_str()\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:715\u001b[0m, in \u001b[0;36mSimulation._calculate\u001b[0;34m(self, variable_name, period)\u001b[0m\n\u001b[1;32m 713\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 714\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_for_cycle(variable\u001b[38;5;241m.\u001b[39mname, period)\n\u001b[0;32m--> 715\u001b[0m array \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_formula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 717\u001b[0m \u001b[38;5;66;03m# If no result, use the default value and cache it\u001b[39;00m\n\u001b[1;32m 718\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m array \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 719\u001b[0m \u001b[38;5;66;03m# Check if the variable has a previously defined value\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:1005\u001b[0m, in \u001b[0;36mSimulation._run_formula\u001b[0;34m(self, variable, population, period)\u001b[0m\n\u001b[1;32m 1003\u001b[0m array \u001b[38;5;241m=\u001b[39m formula(population, period)\n\u001b[1;32m 1004\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1005\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[43mformula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparameters_at\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1007\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m array\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/variables/gov/irs/tax/federal_income/income_tax.py:18\u001b[0m, in \u001b[0;36mincome_tax.formula\u001b[0;34m(person, period, parameters)\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 18\u001b[0m added_components \u001b[38;5;241m=\u001b[39m \u001b[43madd\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 19\u001b[0m \u001b[43m \u001b[49m\u001b[43mperson\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mincome_tax_before_refundable_credits\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[1;32m 20\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 21\u001b[0m subtracted_components \u001b[38;5;241m=\u001b[39m add(\n\u001b[1;32m 22\u001b[0m person, period, [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mincome_tax_refundable_credits\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 23\u001b[0m )\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m added_components \u001b[38;5;241m-\u001b[39m subtracted_components\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/commons/formulas.py:227\u001b[0m, in \u001b[0;36madd\u001b[0;34m(entity, period, variables, options)\u001b[0m\n\u001b[1;32m 207\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21madd\u001b[39m(\n\u001b[1;32m 208\u001b[0m entity: Population,\n\u001b[1;32m 209\u001b[0m period: Period,\n\u001b[1;32m 210\u001b[0m variables: List[\u001b[38;5;28mstr\u001b[39m],\n\u001b[1;32m 211\u001b[0m options: List[\u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 212\u001b[0m ):\n\u001b[1;32m 213\u001b[0m \u001b[38;5;250m 
\u001b[39m\u001b[38;5;124;03m\"\"\"Sums a list of variables.\u001b[39;00m\n\u001b[1;32m 214\u001b[0m \n\u001b[1;32m 215\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 225\u001b[0m \u001b[38;5;124;03m ArrayLike: The result of the operation.\u001b[39;00m\n\u001b[1;32m 226\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 227\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfor_each_variable\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 228\u001b[0m \u001b[43m \u001b[49m\u001b[43mentity\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvariables\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43magg_func\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43madd\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moptions\u001b[49m\n\u001b[1;32m 229\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/commons/formulas.py:183\u001b[0m, in \u001b[0;36mfor_each_variable\u001b[0;34m(entity, period, variables, agg_func, group_agg_func, options)\u001b[0m\n\u001b[1;32m 181\u001b[0m variable_entity \u001b[38;5;241m=\u001b[39m entity\u001b[38;5;241m.\u001b[39mentity\u001b[38;5;241m.\u001b[39mget_variable(variable)\u001b[38;5;241m.\u001b[39mentity\n\u001b[1;32m 182\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m variable_entity\u001b[38;5;241m.\u001b[39mkey \u001b[38;5;241m==\u001b[39m entity\u001b[38;5;241m.\u001b[39mentity\u001b[38;5;241m.\u001b[39mkey:\n\u001b[0;32m--> 183\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[43mentity\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 184\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m variable_entity\u001b[38;5;241m.\u001b[39mis_person:\n\u001b[1;32m 185\u001b[0m values \u001b[38;5;241m=\u001b[39m group_agg_func(\n\u001b[1;32m 186\u001b[0m entity\u001b[38;5;241m.\u001b[39mmembers(variable, period, options\u001b[38;5;241m=\u001b[39moptions)\n\u001b[1;32m 187\u001b[0m )\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/populations/group_population.py:38\u001b[0m, in \u001b[0;36mGroupPopulation.__call__\u001b[0;34m(self, variable_name, period, options)\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msum(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmembers(variable_name, period, options))\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 38\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__call__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[43m)\u001b[49m\n", + "File 
\u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/populations/population.py:137\u001b[0m, in \u001b[0;36mPopulation.__call__\u001b[0;34m(self, variable_name, period, options)\u001b[0m\n\u001b[1;32m 133\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msimulation\u001b[38;5;241m.\u001b[39mcalculate_divide(\n\u001b[1;32m 134\u001b[0m variable_name, period, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mcalculate_kwargs\n\u001b[1;32m 135\u001b[0m )\n\u001b[1;32m 136\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 137\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msimulation\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 138\u001b[0m \u001b[43m \u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcalculate_kwargs\u001b[49m\n\u001b[1;32m 139\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/microsimulation.py:54\u001b[0m, in \u001b[0;36mMicrosimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, use_weights, decode_enums)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m period \u001b[38;5;241m=\u001b[39m get_period(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period)\n\u001b[0;32m---> 54\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_enums\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m use_weights:\n\u001b[1;32m 56\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m values\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:485\u001b[0m, in \u001b[0;36mSimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, decode_enums)\u001b[0m\n\u001b[1;32m 482\u001b[0m np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mseed(\u001b[38;5;28mhash\u001b[39m(variable_name \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(period)) \u001b[38;5;241m%\u001b[39m \u001b[38;5;241m1000000\u001b[39m)\n\u001b[1;32m 484\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 485\u001b[0m result \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_calculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 486\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, EnumArray) \u001b[38;5;129;01mand\u001b[39;00m decode_enums:\n\u001b[1;32m 487\u001b[0m result \u001b[38;5;241m=\u001b[39m result\u001b[38;5;241m.\u001b[39mdecode_to_str()\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:715\u001b[0m, in \u001b[0;36mSimulation._calculate\u001b[0;34m(self, variable_name, period)\u001b[0m\n\u001b[1;32m 713\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 714\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_for_cycle(variable\u001b[38;5;241m.\u001b[39mname, period)\n\u001b[0;32m--> 715\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_formula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 717\u001b[0m \u001b[38;5;66;03m# If no result, use the default value and cache it\u001b[39;00m\n\u001b[1;32m 718\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m array \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 719\u001b[0m \u001b[38;5;66;03m# Check if the variable has a previously defined value\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:1005\u001b[0m, in \u001b[0;36mSimulation._run_formula\u001b[0;34m(self, variable, population, period)\u001b[0m\n\u001b[1;32m 1003\u001b[0m array \u001b[38;5;241m=\u001b[39m formula(population, period)\n\u001b[1;32m 1004\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1005\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[43mformula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparameters_at\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1007\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m array\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/variables/gov/irs/tax/federal_income/income_tax_before_refundable_credits.py:18\u001b[0m, in \u001b[0;36mincome_tax_before_refundable_credits.formula\u001b[0;34m(tax_unit, period, parameters)\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 18\u001b[0m added_components \u001b[38;5;241m=\u001b[39m \u001b[43madd\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 19\u001b[0m \u001b[43m \u001b[49m\u001b[43mtax_unit\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 20\u001b[0m \u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 21\u001b[0m \u001b[43m \u001b[49m\u001b[43m[\u001b[49m\n\u001b[1;32m 22\u001b[0m \u001b[43m 
\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mincome_tax_before_credits\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 23\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mnet_investment_income_tax\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 24\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrecapture_of_investment_credit\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 25\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43munreported_payroll_tax\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 26\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mqualified_retirement_penalty\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 27\u001b[0m \u001b[43m \u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 28\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 29\u001b[0m subtracted_components \u001b[38;5;241m=\u001b[39m add(\n\u001b[1;32m 30\u001b[0m tax_unit, period, [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mincome_tax_capped_non_refundable_credits\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 31\u001b[0m )\n\u001b[1;32m 32\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m added_components \u001b[38;5;241m-\u001b[39m subtracted_components\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/commons/formulas.py:227\u001b[0m, in \u001b[0;36madd\u001b[0;34m(entity, period, variables, options)\u001b[0m\n\u001b[1;32m 207\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21madd\u001b[39m(\n\u001b[1;32m 208\u001b[0m entity: Population,\n\u001b[1;32m 209\u001b[0m period: Period,\n\u001b[1;32m 210\u001b[0m variables: List[\u001b[38;5;28mstr\u001b[39m],\n\u001b[1;32m 211\u001b[0m options: List[\u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 212\u001b[0m ):\n\u001b[1;32m 213\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Sums a list of variables.\u001b[39;00m\n\u001b[1;32m 214\u001b[0m \n\u001b[1;32m 215\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 225\u001b[0m \u001b[38;5;124;03m ArrayLike: The result of the operation.\u001b[39;00m\n\u001b[1;32m 226\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 227\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfor_each_variable\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 228\u001b[0m \u001b[43m \u001b[49m\u001b[43mentity\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvariables\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43magg_func\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43madd\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moptions\u001b[49m\n\u001b[1;32m 229\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/commons/formulas.py:183\u001b[0m, in \u001b[0;36mfor_each_variable\u001b[0;34m(entity, period, 
variables, agg_func, group_agg_func, options)\u001b[0m\n\u001b[1;32m 181\u001b[0m variable_entity \u001b[38;5;241m=\u001b[39m entity\u001b[38;5;241m.\u001b[39mentity\u001b[38;5;241m.\u001b[39mget_variable(variable)\u001b[38;5;241m.\u001b[39mentity\n\u001b[1;32m 182\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m variable_entity\u001b[38;5;241m.\u001b[39mkey \u001b[38;5;241m==\u001b[39m entity\u001b[38;5;241m.\u001b[39mentity\u001b[38;5;241m.\u001b[39mkey:\n\u001b[0;32m--> 183\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[43mentity\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 184\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m variable_entity\u001b[38;5;241m.\u001b[39mis_person:\n\u001b[1;32m 185\u001b[0m values \u001b[38;5;241m=\u001b[39m group_agg_func(\n\u001b[1;32m 186\u001b[0m entity\u001b[38;5;241m.\u001b[39mmembers(variable, period, options\u001b[38;5;241m=\u001b[39moptions)\n\u001b[1;32m 187\u001b[0m )\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/populations/group_population.py:38\u001b[0m, in \u001b[0;36mGroupPopulation.__call__\u001b[0;34m(self, variable_name, period, options)\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msum(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmembers(variable_name, period, options))\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 38\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__call__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/populations/population.py:137\u001b[0m, in \u001b[0;36mPopulation.__call__\u001b[0;34m(self, variable_name, period, options)\u001b[0m\n\u001b[1;32m 133\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msimulation\u001b[38;5;241m.\u001b[39mcalculate_divide(\n\u001b[1;32m 134\u001b[0m variable_name, period, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mcalculate_kwargs\n\u001b[1;32m 135\u001b[0m )\n\u001b[1;32m 136\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 137\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msimulation\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 138\u001b[0m \u001b[43m \u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcalculate_kwargs\u001b[49m\n\u001b[1;32m 139\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File 
\u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/microsimulation.py:54\u001b[0m, in \u001b[0;36mMicrosimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, use_weights, decode_enums)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m period \u001b[38;5;241m=\u001b[39m get_period(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period)\n\u001b[0;32m---> 54\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_enums\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m use_weights:\n\u001b[1;32m 56\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m values\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:485\u001b[0m, in \u001b[0;36mSimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, decode_enums)\u001b[0m\n\u001b[1;32m 482\u001b[0m np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mseed(\u001b[38;5;28mhash\u001b[39m(variable_name \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(period)) \u001b[38;5;241m%\u001b[39m \u001b[38;5;241m1000000\u001b[39m)\n\u001b[1;32m 484\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 485\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_calculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 486\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, EnumArray) \u001b[38;5;129;01mand\u001b[39;00m decode_enums:\n\u001b[1;32m 487\u001b[0m result \u001b[38;5;241m=\u001b[39m result\u001b[38;5;241m.\u001b[39mdecode_to_str()\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:715\u001b[0m, in \u001b[0;36mSimulation._calculate\u001b[0;34m(self, variable_name, period)\u001b[0m\n\u001b[1;32m 713\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 714\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_for_cycle(variable\u001b[38;5;241m.\u001b[39mname, period)\n\u001b[0;32m--> 715\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_formula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 717\u001b[0m \u001b[38;5;66;03m# If no result, 
use the default value and cache it\u001b[39;00m\n\u001b[1;32m 718\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m array \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 719\u001b[0m \u001b[38;5;66;03m# Check if the variable has a previously defined value\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:940\u001b[0m, in \u001b[0;36mSimulation._run_formula\u001b[0;34m(self, variable, population, period)\u001b[0m\n\u001b[1;32m 938\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m added_variable \u001b[38;5;129;01min\u001b[39;00m adds_list:\n\u001b[1;32m 939\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m added_variable \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtax_benefit_system\u001b[38;5;241m.\u001b[39mvariables:\n\u001b[0;32m--> 940\u001b[0m values \u001b[38;5;241m=\u001b[39m values \u001b[38;5;241m+\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 941\u001b[0m \u001b[43m \u001b[49m\u001b[43madded_variable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvariable\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mentity\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkey\u001b[49m\n\u001b[1;32m 942\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 943\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 944\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/microsimulation.py:54\u001b[0m, in \u001b[0;36mMicrosimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, use_weights, decode_enums)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m period \u001b[38;5;241m=\u001b[39m get_period(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period)\n\u001b[0;32m---> 54\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_enums\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m use_weights:\n\u001b[1;32m 56\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m values\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:485\u001b[0m, in \u001b[0;36mSimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, decode_enums)\u001b[0m\n\u001b[1;32m 482\u001b[0m 
np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mseed(\u001b[38;5;28mhash\u001b[39m(variable_name \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(period)) \u001b[38;5;241m%\u001b[39m \u001b[38;5;241m1000000\u001b[39m)\n\u001b[1;32m 484\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 485\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_calculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 486\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, EnumArray) \u001b[38;5;129;01mand\u001b[39;00m decode_enums:\n\u001b[1;32m 487\u001b[0m result \u001b[38;5;241m=\u001b[39m result\u001b[38;5;241m.\u001b[39mdecode_to_str()\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:715\u001b[0m, in \u001b[0;36mSimulation._calculate\u001b[0;34m(self, variable_name, period)\u001b[0m\n\u001b[1;32m 713\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 714\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_for_cycle(variable\u001b[38;5;241m.\u001b[39mname, period)\n\u001b[0;32m--> 715\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_formula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 717\u001b[0m \u001b[38;5;66;03m# If no result, use the default value and cache it\u001b[39;00m\n\u001b[1;32m 718\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m array \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 719\u001b[0m \u001b[38;5;66;03m# Check if the variable has a previously defined value\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:1005\u001b[0m, in \u001b[0;36mSimulation._run_formula\u001b[0;34m(self, variable, population, period)\u001b[0m\n\u001b[1;32m 1003\u001b[0m array \u001b[38;5;241m=\u001b[39m formula(population, period)\n\u001b[1;32m 1004\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1005\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[43mformula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparameters_at\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1007\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m array\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/variables/gov/irs/tax/federal_income/alternative_minimum_tax/alternative_minimum_tax.py:24\u001b[0m, in \u001b[0;36malternative_minimum_tax.formula\u001b[0;34m(tax_unit, period, parameters)\u001b[0m\n\u001b[1;32m 21\u001b[0m amt_base_tax \u001b[38;5;241m=\u001b[39m tax_unit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mamt_base_tax\u001b[39m\u001b[38;5;124m\"\u001b[39m, period)\n\u001b[1;32m 23\u001b[0m \u001b[38;5;66;03m# Tax on capital gains (Part III)\u001b[39;00m\n\u001b[0;32m---> 24\u001b[0m form_6251_part_iii_required \u001b[38;5;241m=\u001b[39m 
\u001b[43mtax_unit\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mamt_part_iii_required\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 26\u001b[0m amt_tax_including_cg \u001b[38;5;241m=\u001b[39m tax_unit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mamt_tax_including_cg\u001b[39m\u001b[38;5;124m\"\u001b[39m, period)\n\u001b[1;32m 27\u001b[0m smaller_tax \u001b[38;5;241m=\u001b[39m min_(amt_base_tax, amt_tax_including_cg)\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/populations/group_population.py:38\u001b[0m, in \u001b[0;36mGroupPopulation.__call__\u001b[0;34m(self, variable_name, period, options)\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msum(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmembers(variable_name, period, options))\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 38\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__call__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/populations/population.py:137\u001b[0m, in \u001b[0;36mPopulation.__call__\u001b[0;34m(self, variable_name, period, options)\u001b[0m\n\u001b[1;32m 133\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msimulation\u001b[38;5;241m.\u001b[39mcalculate_divide(\n\u001b[1;32m 134\u001b[0m variable_name, period, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mcalculate_kwargs\n\u001b[1;32m 135\u001b[0m )\n\u001b[1;32m 136\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 137\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msimulation\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 138\u001b[0m \u001b[43m \u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcalculate_kwargs\u001b[49m\n\u001b[1;32m 139\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/microsimulation.py:54\u001b[0m, in \u001b[0;36mMicrosimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, use_weights, decode_enums)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m period \u001b[38;5;241m=\u001b[39m 
get_period(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period)\n\u001b[0;32m---> 54\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_enums\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m use_weights:\n\u001b[1;32m 56\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m values\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:485\u001b[0m, in \u001b[0;36mSimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, decode_enums)\u001b[0m\n\u001b[1;32m 482\u001b[0m np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mseed(\u001b[38;5;28mhash\u001b[39m(variable_name \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(period)) \u001b[38;5;241m%\u001b[39m \u001b[38;5;241m1000000\u001b[39m)\n\u001b[1;32m 484\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 485\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_calculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 486\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, EnumArray) \u001b[38;5;129;01mand\u001b[39;00m decode_enums:\n\u001b[1;32m 487\u001b[0m result \u001b[38;5;241m=\u001b[39m result\u001b[38;5;241m.\u001b[39mdecode_to_str()\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:715\u001b[0m, in \u001b[0;36mSimulation._calculate\u001b[0;34m(self, variable_name, period)\u001b[0m\n\u001b[1;32m 713\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 714\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_for_cycle(variable\u001b[38;5;241m.\u001b[39mname, period)\n\u001b[0;32m--> 715\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_formula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 717\u001b[0m \u001b[38;5;66;03m# If no result, use the default value and cache it\u001b[39;00m\n\u001b[1;32m 718\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m array \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 719\u001b[0m \u001b[38;5;66;03m# Check if the variable has a previously defined value\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:1005\u001b[0m, in \u001b[0;36mSimulation._run_formula\u001b[0;34m(self, variable, population, period)\u001b[0m\n\u001b[1;32m 1003\u001b[0m array \u001b[38;5;241m=\u001b[39m formula(population, period)\n\u001b[1;32m 1004\u001b[0m 
\u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1005\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[43mformula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparameters_at\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1007\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m array\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/variables/gov/irs/tax/federal_income/alternative_minimum_tax/amt_part_iii_required.py:13\u001b[0m, in \u001b[0;36mamt_part_iii_required.formula\u001b[0;34m(tax_unit, period, parameters)\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mformula\u001b[39m(tax_unit, period, parameters):\n\u001b[0;32m---> 13\u001b[0m relevant_inputs \u001b[38;5;241m=\u001b[39m \u001b[43madd\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 14\u001b[0m \u001b[43m \u001b[49m\u001b[43mtax_unit\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 15\u001b[0m \u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 16\u001b[0m \u001b[43m \u001b[49m\u001b[43m[\u001b[49m\n\u001b[1;32m 17\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdwks10\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 18\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdwks13\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 19\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdwks14\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 20\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdwks19\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 21\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43munrecaptured_section_1250_gain\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 22\u001b[0m \u001b[43m \u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 23\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m relevant_inputs \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/commons/formulas.py:227\u001b[0m, in \u001b[0;36madd\u001b[0;34m(entity, period, variables, options)\u001b[0m\n\u001b[1;32m 207\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21madd\u001b[39m(\n\u001b[1;32m 208\u001b[0m entity: Population,\n\u001b[1;32m 209\u001b[0m period: Period,\n\u001b[1;32m 210\u001b[0m variables: List[\u001b[38;5;28mstr\u001b[39m],\n\u001b[1;32m 211\u001b[0m options: List[\u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 212\u001b[0m ):\n\u001b[1;32m 213\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Sums a list of variables.\u001b[39;00m\n\u001b[1;32m 214\u001b[0m \n\u001b[1;32m 215\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 225\u001b[0m \u001b[38;5;124;03m ArrayLike: The result of the operation.\u001b[39;00m\n\u001b[1;32m 226\u001b[0m \u001b[38;5;124;03m 
\"\"\"\u001b[39;00m\n\u001b[0;32m--> 227\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfor_each_variable\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 228\u001b[0m \u001b[43m \u001b[49m\u001b[43mentity\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvariables\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43magg_func\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43madd\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moptions\u001b[49m\n\u001b[1;32m 229\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/commons/formulas.py:183\u001b[0m, in \u001b[0;36mfor_each_variable\u001b[0;34m(entity, period, variables, agg_func, group_agg_func, options)\u001b[0m\n\u001b[1;32m 181\u001b[0m variable_entity \u001b[38;5;241m=\u001b[39m entity\u001b[38;5;241m.\u001b[39mentity\u001b[38;5;241m.\u001b[39mget_variable(variable)\u001b[38;5;241m.\u001b[39mentity\n\u001b[1;32m 182\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m variable_entity\u001b[38;5;241m.\u001b[39mkey \u001b[38;5;241m==\u001b[39m entity\u001b[38;5;241m.\u001b[39mentity\u001b[38;5;241m.\u001b[39mkey:\n\u001b[0;32m--> 183\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[43mentity\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 184\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m variable_entity\u001b[38;5;241m.\u001b[39mis_person:\n\u001b[1;32m 185\u001b[0m values \u001b[38;5;241m=\u001b[39m group_agg_func(\n\u001b[1;32m 186\u001b[0m entity\u001b[38;5;241m.\u001b[39mmembers(variable, period, options\u001b[38;5;241m=\u001b[39moptions)\n\u001b[1;32m 187\u001b[0m )\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/populations/group_population.py:38\u001b[0m, in \u001b[0;36mGroupPopulation.__call__\u001b[0;34m(self, variable_name, period, options)\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msum(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmembers(variable_name, period, options))\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 38\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__call__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/populations/population.py:137\u001b[0m, in \u001b[0;36mPopulation.__call__\u001b[0;34m(self, variable_name, period, options)\u001b[0m\n\u001b[1;32m 133\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msimulation\u001b[38;5;241m.\u001b[39mcalculate_divide(\n\u001b[1;32m 134\u001b[0m variable_name, period, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mcalculate_kwargs\n\u001b[1;32m 135\u001b[0m )\n\u001b[1;32m 136\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 137\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msimulation\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 138\u001b[0m \u001b[43m \u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcalculate_kwargs\u001b[49m\n\u001b[1;32m 139\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/microsimulation.py:54\u001b[0m, in \u001b[0;36mMicrosimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, use_weights, decode_enums)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m period \u001b[38;5;241m=\u001b[39m get_period(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period)\n\u001b[0;32m---> 54\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_enums\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m use_weights:\n\u001b[1;32m 56\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m values\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:485\u001b[0m, in \u001b[0;36mSimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, decode_enums)\u001b[0m\n\u001b[1;32m 482\u001b[0m np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mseed(\u001b[38;5;28mhash\u001b[39m(variable_name \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(period)) \u001b[38;5;241m%\u001b[39m \u001b[38;5;241m1000000\u001b[39m)\n\u001b[1;32m 484\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 485\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_calculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 486\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, EnumArray) \u001b[38;5;129;01mand\u001b[39;00m decode_enums:\n\u001b[1;32m 487\u001b[0m result 
\u001b[38;5;241m=\u001b[39m result\u001b[38;5;241m.\u001b[39mdecode_to_str()\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:612\u001b[0m, in \u001b[0;36mSimulation._calculate\u001b[0;34m(self, variable_name, period)\u001b[0m\n\u001b[1;32m 610\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mVariable \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvariable_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m does not exist.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 611\u001b[0m population \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_variable_population(variable_name)\n\u001b[0;32m--> 612\u001b[0m holder \u001b[38;5;241m=\u001b[39m \u001b[43mpopulation\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_holder\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 613\u001b[0m variable \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtax_benefit_system\u001b[38;5;241m.\u001b[39mget_variable(\n\u001b[1;32m 614\u001b[0m variable_name, check_existence\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 615\u001b[0m )\n\u001b[1;32m 617\u001b[0m \u001b[38;5;66;03m# Check if we've neutralized via parameters.\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/populations/population.py:145\u001b[0m, in \u001b[0;36mPopulation.get_holder\u001b[0;34m(self, variable_name)\u001b[0m\n\u001b[1;32m 143\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mget_holder\u001b[39m(\u001b[38;5;28mself\u001b[39m, variable_name: \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Holder:\n\u001b[1;32m 144\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mentity\u001b[38;5;241m.\u001b[39mcheck_variable_defined_for_entity(variable_name)\n\u001b[0;32m--> 145\u001b[0m holder \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_holders\u001b[38;5;241m.\u001b[39mget(variable_name)\n\u001b[1;32m 146\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m holder:\n\u001b[1;32m 147\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m holder\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], "source": [ "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", "\n", @@ -711,10 +779,43 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "9602953a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " strategy parameter dataset_size total_loss\n", + "0 none none 41310 0.0069\n", + "1 l0_sigmoid 1.0 41310 0.0069\n", + "2 l0_sigmoid 0.1 37048 5.8041\n", + "3 l0_sigmoid 0.01 32786 13.3255\n", + "4 l0_sigmoid 0.001 28524 21.6723\n", + "5 l0_sigmoid 0.0001 24262 30.6049\n", + "6 l0_sigmoid 0.00001 20000 40.0000\n", + "7 l0_log 1.0 41310 0.0069\n", + "8 l0_log 0.1 37048 8.7028\n", + "9 l0_log 0.01 32786 19.9847\n", + "10 l0_log 0.001 28524 32.5050\n", + "11 l0_log 0.0001 24262 45.9039\n", + "12 l0_log 0.00001 20000 59.9965\n", + "13 l0_exp 1.0 41310 0.0069\n", + "14 l0_exp 0.1 37048 11.6014\n", + "15 l0_exp 0.01 32786 26.6440\n", + "16 l0_exp 0.001 
28524 43.3377\n", + "17 l0_exp 0.0001 24262 61.2029\n", + "18 l0_exp 0.00001 20000 79.9931\n", + "19 l1 1.0 41310 0.0069\n", + "20 l1 0.1 37048 14.5000\n", + "21 l1 0.01 32786 33.3033\n", + "22 l1 0.001 28524 54.1704\n", + "23 l1 0.0001 24262 76.5019\n", + "24 l1 0.00001 20000 99.9896\n" + ] + } + ], "source": [ "'''\n", "Synthetic dataset\n", @@ -730,14 +831,28 @@ "base_loss = 0.0069\n", "max_loss = 40.0\n", "\n", - "# Construct rows\n", + "strategy_slopes = {\n", + " 'l0_sigmoid': 1.0,\n", + " 'l0_log': 1.5,\n", + " 'l0_exp': 2.0,\n", + " 'l1': 2.5,\n", + "}\n", + "\n", "rows = [{'strategy': 'none', 'parameter': 'none', 'dataset_size': base_size, 'total_loss': base_loss}]\n", "\n", "for strategy in strategies:\n", + " slope = strategy_slopes[strategy]\n", + " \n", " for i, param in enumerate(parameters):\n", - " # Gradually decrease size and increase loss\n", - " size = int(base_size - (base_size - min_size) * (i / (len(parameters) - 1)))\n", - " loss = round(base_loss + (max_loss - base_loss) * (i / (len(parameters) - 1)), 4)\n", + " # Normalized compression level: 0 (no compression) to 1 (max compression)\n", + " compression_level = i / (len(parameters) - 1)\n", + " \n", + " # Size shrinks linearly\n", + " size = int(base_size - (base_size - min_size) * compression_level)\n", + " \n", + " # Loss increases quadratically (or linearly) based on strategy slope\n", + " loss = round(base_loss + slope * (max_loss - base_loss) * (compression_level ** 1.2), 4)\n", + " \n", " rows.append({\n", " 'strategy': strategy,\n", " 'parameter': param,\n", @@ -751,6 +866,1244 @@ "# Display\n", "print(reg_results_df)" ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2dc0891c", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "plotlyServerURL": "https://plot.ly" + }, + "data": [ + { + "customdata": [ + [ + "l0_exp" + ], + [ + "l0_exp" + ], + [ + "l0_exp" + ], + [ + "l0_exp" + ], + [ + "l0_exp" + ], + [ + "l0_exp" + ] + ], + "hovertemplate": "Strategy: %{customdata[0]}
Size: %{x}<br>Loss: %{y:.4f}<br>
Param: %{text}", + "legendgroup": "l0_exp", + "line": { + "color": "#636efa", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines+markers+text", + "name": "l0_exp", + "orientation": "v", + "showlegend": true, + "text": [ + "1.0", + "0.1", + "0.01", + "0.001", + "0.0001", + "1e-05" + ], + "textposition": "top center", + "type": "scatter", + "x": [ + 41310, + 37048, + 32786, + 28524, + 24262, + 20000 + ], + "xaxis": "x", + "y": [ + 0.0069, + 11.6014, + 26.644, + 43.3377, + 61.2029, + 79.9931 + ], + "yaxis": "y" + }, + { + "customdata": [ + [ + "l0_log" + ], + [ + "l0_log" + ], + [ + "l0_log" + ], + [ + "l0_log" + ], + [ + "l0_log" + ], + [ + "l0_log" + ] + ], + "hovertemplate": "Strategy: %{customdata[0]}
Size: %{x}<br>Loss: %{y:.4f}<br>
Param: %{text}", + "legendgroup": "l0_log", + "line": { + "color": "#EF553B", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines+markers+text", + "name": "l0_log", + "orientation": "v", + "showlegend": true, + "text": [ + "1.0", + "0.1", + "0.01", + "0.001", + "0.0001", + "1e-05" + ], + "textposition": "top center", + "type": "scatter", + "x": [ + 41310, + 37048, + 32786, + 28524, + 24262, + 20000 + ], + "xaxis": "x", + "y": [ + 0.0069, + 8.7028, + 19.9847, + 32.505, + 45.9039, + 59.9965 + ], + "yaxis": "y" + }, + { + "customdata": [ + [ + "l0_sigmoid" + ], + [ + "l0_sigmoid" + ], + [ + "l0_sigmoid" + ], + [ + "l0_sigmoid" + ], + [ + "l0_sigmoid" + ], + [ + "l0_sigmoid" + ] + ], + "hovertemplate": "Strategy: %{customdata[0]}
Size: %{x}<br>Loss: %{y:.4f}<br>
Param: %{text}", + "legendgroup": "l0_sigmoid", + "line": { + "color": "#00cc96", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines+markers+text", + "name": "l0_sigmoid", + "orientation": "v", + "showlegend": true, + "text": [ + "1.0", + "0.1", + "0.01", + "0.001", + "0.0001", + "1e-05" + ], + "textposition": "top center", + "type": "scatter", + "x": [ + 41310, + 37048, + 32786, + 28524, + 24262, + 20000 + ], + "xaxis": "x", + "y": [ + 0.0069, + 5.8041, + 13.3255, + 21.6723, + 30.6049, + 40 + ], + "yaxis": "y" + }, + { + "customdata": [ + [ + "l1" + ], + [ + "l1" + ], + [ + "l1" + ], + [ + "l1" + ], + [ + "l1" + ], + [ + "l1" + ] + ], + "hovertemplate": "Strategy: %{customdata[0]}
Size: %{x}<br>Loss: %{y:.4f}<br>
Param: %{text}", + "legendgroup": "l1", + "line": { + "color": "#ab63fa", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines+markers+text", + "name": "l1", + "orientation": "v", + "showlegend": true, + "text": [ + "1.0", + "0.1", + "0.01", + "0.001", + "0.0001", + "1e-05" + ], + "textposition": "top center", + "type": "scatter", + "x": [ + 41310, + 37048, + 32786, + 28524, + 24262, + 20000 + ], + "xaxis": "x", + "y": [ + 0.0069, + 14.5, + 33.3033, + 54.1704, + 76.5019, + 99.9896 + ], + "yaxis": "y" + } + ], + "layout": { + "annotations": [ + { + "arrowhead": 1, + "ax": 40, + "ay": -40, + "font": { + "color": "gray" + }, + "showarrow": true, + "text": "Baseline", + "x": 41310, + "y": 0.0069 + } + ], + "height": 600, + "hovermode": "closest", + "legend": { + "title": { + "text": "Strategy" + }, + "tracegroupgap": 0 + }, + "shapes": [ + { + "line": { + "color": "gray", + "dash": "dash" + }, + "name": "Baseline Size", + "type": "line", + "x0": 41310, + "x1": 41310, + "y0": 0.0069, + "y1": 99.9896 + }, + { + "line": { + "color": "gray", + "dash": "dash" + }, + "name": "Baseline Loss", + "type": "line", + "x0": 20000, + "x1": 41310, + "y0": 0.0069, + "y1": 0.0069 + } + ], + "template": { + "data": { + "bar": [ + { + "error_x": { + "color": "#2a3f5f" + }, + "error_y": { + "color": "#2a3f5f" + }, + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + }, + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "bar" + } + ], + "barpolar": [ + { + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + }, + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "barpolar" + } + ], + "carpet": [ + { + "aaxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "baxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "type": "carpet" + } + ], + "choropleth": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "choropleth" + } + ], + "contour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "contour" + } + ], + "contourcarpet": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "contourcarpet" + } + ], + "heatmap": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmap" + } + ], + "heatmapgl": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 
0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmapgl" + } + ], + "histogram": [ + { + "marker": { + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "histogram" + } + ], + "histogram2d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2d" + } + ], + "histogram2dcontour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2dcontour" + } + ], + "mesh3d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "mesh3d" + } + ], + "parcoords": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "parcoords" + } + ], + "pie": [ + { + "automargin": true, + "type": "pie" + } + ], + "scatter": [ + { + "fillpattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + }, + "type": "scatter" + } + ], + "scatter3d": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter3d" + } + ], + "scattercarpet": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattercarpet" + } + ], + "scattergeo": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergeo" + } + ], + "scattergl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergl" + } + ], + "scattermapbox": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattermapbox" + } + ], + "scatterpolar": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolar" + } + ], + "scatterpolargl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolargl" + } + ], + "scatterternary": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterternary" + } + ], + "surface": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + 
"#f0f921" + ] + ], + "type": "surface" + } + ], + "table": [ + { + "cells": { + "fill": { + "color": "#EBF0F8" + }, + "line": { + "color": "white" + } + }, + "header": { + "fill": { + "color": "#C8D4E3" + }, + "line": { + "color": "white" + } + }, + "type": "table" + } + ] + }, + "layout": { + "annotationdefaults": { + "arrowcolor": "#2a3f5f", + "arrowhead": 0, + "arrowwidth": 1 + }, + "autotypenumbers": "strict", + "coloraxis": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "colorscale": { + "diverging": [ + [ + 0, + "#8e0152" + ], + [ + 0.1, + "#c51b7d" + ], + [ + 0.2, + "#de77ae" + ], + [ + 0.3, + "#f1b6da" + ], + [ + 0.4, + "#fde0ef" + ], + [ + 0.5, + "#f7f7f7" + ], + [ + 0.6, + "#e6f5d0" + ], + [ + 0.7, + "#b8e186" + ], + [ + 0.8, + "#7fbc41" + ], + [ + 0.9, + "#4d9221" + ], + [ + 1, + "#276419" + ] + ], + "sequential": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "sequentialminus": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ] + }, + "colorway": [ + "#636efa", + "#EF553B", + "#00cc96", + "#ab63fa", + "#FFA15A", + "#19d3f3", + "#FF6692", + "#B6E880", + "#FF97FF", + "#FECB52" + ], + "font": { + "color": "#2a3f5f" + }, + "geo": { + "bgcolor": "white", + "lakecolor": "white", + "landcolor": "#E5ECF6", + "showlakes": true, + "showland": true, + "subunitcolor": "white" + }, + "hoverlabel": { + "align": "left" + }, + "hovermode": "closest", + "mapbox": { + "style": "light" + }, + "paper_bgcolor": "white", + "plot_bgcolor": "#E5ECF6", + "polar": { + "angularaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "radialaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "scene": { + "xaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "yaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "zaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + } + }, + "shapedefaults": { + "line": { + "color": "#2a3f5f" + } + }, + "ternary": { + "aaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "baxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "caxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "title": { + "x": 0.05 + }, + "xaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + }, + "yaxis": { + 
"automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + } + } + }, + "title": { + "text": "ECPS Regularization Strategy Comparison" + }, + "width": 900, + "xaxis": { + "anchor": "y", + "domain": [ + 0, + 1 + ], + "title": { + "text": "Number of Households" + } + }, + "yaxis": { + "anchor": "x", + "domain": [ + 0, + 1 + ], + "title": { + "text": "Calibration Score" + } + } + } + } + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Filter out the baseline row\n", + "df_plot = reg_results_df[reg_results_df['strategy'] != 'none'].copy()\n", + "df_plot['parameter'] = df_plot['parameter'].astype(str)\n", + "df_plot = df_plot.sort_values(by=['strategy', 'dataset_size'], ascending=[True, False])\n", + "\n", + "# Create line plot\n", + "fig = px.line(\n", + " df_plot,\n", + " x=\"dataset_size\",\n", + " y=\"total_loss\",\n", + " color=\"strategy\",\n", + " markers=True,\n", + " text=\"parameter\",\n", + " custom_data=[\"strategy\"],\n", + " title=\"ECPS Regularization Strategy Comparison\",\n", + " labels={\n", + " \"dataset_size\": \"Number of Households\",\n", + " \"total_loss\": \"Calibration Score\",\n", + " \"strategy\": \"Regularization Approach\"\n", + " }\n", + ")\n", + "\n", + "# Add text labels (parameter) on hover\n", + "fig.update_traces(\n", + " textposition=\"top center\", \n", + " hovertemplate=(\n", + " \"Strategy: %{customdata[0]}
\"\n", + " \"Size: %{x}
\"\n", + " \"Loss: %{y:.4f}
\"\n", + " \"Param: %{text}\"\n", + " )\n", + ")\n", + "\n", + "# Add baseline lines\n", + "baseline = reg_results_df[reg_results_df['strategy'] == 'none'].iloc[0]\n", + "\n", + "fig.add_shape(\n", + " type=\"line\",\n", + " x0=baseline[\"dataset_size\"], x1=baseline[\"dataset_size\"],\n", + " y0=df_plot[\"total_loss\"].min(), y1=df_plot[\"total_loss\"].max(),\n", + " line=dict(color=\"gray\", dash=\"dash\"),\n", + " name=\"Baseline Size\"\n", + ")\n", + "\n", + "fig.add_shape(\n", + " type=\"line\",\n", + " x0=df_plot[\"dataset_size\"].min(), x1=df_plot[\"dataset_size\"].max(),\n", + " y0=baseline[\"total_loss\"], y1=baseline[\"total_loss\"],\n", + " line=dict(color=\"gray\", dash=\"dash\"),\n", + " name=\"Baseline Loss\"\n", + ")\n", + "\n", + "# Add annotation for the baseline\n", + "fig.add_annotation(\n", + " x=baseline[\"dataset_size\"],\n", + " y=baseline[\"total_loss\"],\n", + " text=\"Baseline\",\n", + " showarrow=True,\n", + " arrowhead=1,\n", + " ax=40,\n", + " ay=-40,\n", + " font=dict(color=\"gray\"),\n", + ")\n", + "\n", + "# Final layout adjustments\n", + "fig.update_layout(\n", + " legend_title=\"Strategy\",\n", + " hovermode=\"closest\",\n", + " width=900,\n", + " height=600\n", + ")\n", + "\n", + "fig.show()" + ] } ], "metadata": { From b3208db3a85d703081874c17f80d0ae20e8a02a2 Mon Sep 17 00:00:00 2001 From: eccuraa Date: Tue, 15 Jul 2025 15:55:42 -0400 Subject: [PATCH 53/58] full test arena?? (trying it now) --- test_minimization_approach.ipynb | 147 ++++++++----------------------- 1 file changed, 35 insertions(+), 112 deletions(-) diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 9952ae4e..6c3921f0 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,19 +2,10 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "id": "d6dc9cca", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], + "outputs": [], "source": [ "from policyengine_us_data.utils.minimize import minimize_dataset, random_sampling_minimization, candidate_loss_contribution\n", "from policyengine_us_data.storage import STORAGE_FOLDER\n", @@ -226,65 +217,15 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "aeab67b3", "metadata": {}, "outputs": [ { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[6], line 31\u001b[0m\n\u001b[1;32m 29\u001b[0m output_path \u001b[38;5;241m=\u001b[39m STORAGE_FOLDER \u001b[38;5;241m/\u001b[39m approach \u001b[38;5;241m/\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvalue\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_enhanced_cps_2024_minimised.h5\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 30\u001b[0m output_path\u001b[38;5;241m.\u001b[39mparent\u001b[38;5;241m.\u001b[39mmkdir(parents\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m---> 31\u001b[0m \u001b[43mminimize_dataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 32\u001b[0m \u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 33\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 34\u001b[0m \u001b[43m \u001b[49m\u001b[43mminimization_function\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mminimization_function\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 35\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget_fractions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 36\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m params \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mloss_rel_change_max\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 38\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m files:\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/utils/minimize.py:336\u001b[0m, in \u001b[0;36mminimize_dataset\u001b[0;34m(dataset, output_path, minimization_function, loss_matrix, targets, **kwargs)\u001b[0m\n\u001b[1;32m 333\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 334\u001b[0m dataset_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(dataset)\n\u001b[0;32m--> 336\u001b[0m \u001b[43mcreate_calibration_log_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 338\u001b[0m dataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_file(dataset_path)\n\u001b[1;32m 339\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m loss_matrix \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m targets \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/utils/minimize.py:26\u001b[0m, in 
[Remainder of the ANSI-escaped KeyboardInterrupt traceback omitted; the interrupt was raised inside build_loss_matrix while calculating income_tax for the tax-expenditure targets.]
\u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_variable_population(variable_name)\n\u001b[0;32m--> 612\u001b[0m holder \u001b[38;5;241m=\u001b[39m \u001b[43mpopulation\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_holder\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 613\u001b[0m variable \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtax_benefit_system\u001b[38;5;241m.\u001b[39mget_variable(\n\u001b[1;32m 614\u001b[0m variable_name, check_existence\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 615\u001b[0m )\n\u001b[1;32m 617\u001b[0m \u001b[38;5;66;03m# Check if we've neutralized via parameters.\u001b[39;00m\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/populations/population.py:145\u001b[0m, in \u001b[0;36mPopulation.get_holder\u001b[0;34m(self, variable_name)\u001b[0m\n\u001b[1;32m 143\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mget_holder\u001b[39m(\u001b[38;5;28mself\u001b[39m, variable_name: \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Holder:\n\u001b[1;32m 144\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mentity\u001b[38;5;241m.\u001b[39mcheck_variable_defined_for_entity(variable_name)\n\u001b[0;32m--> 145\u001b[0m holder \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_holders\u001b[38;5;241m.\u001b[39mget(variable_name)\n\u001b[1;32m 146\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m holder:\n\u001b[1;32m 147\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m holder\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + "name": "stderr", + "output_type": "stream", + "text": [ + " 91%|█████████ | 228/250 [01:44<00:21, 1.04it/s, loss=3.27e-5, loss_rel_change=-0.384]" ] } ], @@ -337,25 +278,6 @@ " )" ] }, - { - "cell_type": "markdown", - "id": "fa1ea957", - "metadata": {}, - "source": [ - "### (Temporary) Cleaning of data (removing weights smaller than epsilon)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e88df261", - "metadata": {}, - "outputs": [], - "source": [ - "## this should go in the enhanced_cps_2024.py file, because household removal doesn't happen there\n", - "# Need to check Ben's PR." 
- ] - }, { "cell_type": "markdown", "id": "f8b0fe2e", @@ -463,7 +385,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": null, "id": "7bb3ef3c", "metadata": {}, "outputs": [ @@ -710,13 +632,9 @@ ], "source": [ "\"\"\"\n", - "Pulling values from created calibration_log.csv and .h5 files to populate the line plot dataframe\n", - "\n", - "( I need to pull the strategy (folder name), parameter (from file title??), dataset size (from length of .h5 file), and total loss (from sum of loss column in calibration_log_file.csv))\n", - "\n", - "approaches = [\"l0_exp\", \"l1\"] \n", - "penalty_weights = [1e-2, 1e-1]\n", + "Scraping values from created calibration_log.csv and .h5 files to populate the plotting dataframe\n", "\"\"\"\n", + "\n", "approaches = [\"l0_sigmoid\", \"l0_log\", \"l0_exp\", \"l1\"]\n", "penalty_weights = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]\n", "og_size = 41310 # Original size of the dataset\n", @@ -731,11 +649,9 @@ " # Pull length of .h5 file\n", " h5_name = f\"enhanced_cps_2024_{strategy}_{parameter}_minimised.h5\"\n", " h5_path = get_output_path(strategy, h5_name)\n", - " # see if this works\n", " dataset_size = len(h5py.File(h5_path, \"r\")['household_weight/2024'])\n", - " #total_size.append(dataset_size)\n", "\n", - " # Pull sum of loss column\n", + " # Pull score of loss column\n", " cal_log_name = f\"calibration_log_{strategy}_{parameter}.csv\"\n", " cal_log_path = get_output_path(strategy, cal_log_name)\n", " calibration_log = pd.read_csv(cal_log_path)\n", @@ -745,27 +661,30 @@ "\n", "\n", "\n", - "'''\n", - "\n", - "fraction = [0.5, 0.6, 0.7, 0.8, 0.9]\n", + "approaches = {\n", + " \"random_sampling_minimization\":[0.5, 0.6, 0.7, 0.8, 0.9], \n", + " \"candidate_loss_contribution\": [0.001, 0.0001, 0.00001, 0.000001, 0.0000001],\n", + "}\n", "\n", - "for fraction in fraction:\n", - " strategy = \"random_sampling_minimization\"\n", - " parameter = fraction\n", + "for approach, fractions in approaches.items(): # Use .items() to get key-value pairs\n", + " for fraction in fractions:\n", + " strategy = approach\n", + " parameter = fraction\n", "\n", - " # Pull length of .h5 file\n", - " h5_name = f\"{fraction}_enhanced_cps_2024_random_sampling_minimization_minimised.h5\"\n", - " h5_path = STORAGE_FOLDER / strategy / h5_name\n", - " dataset_size = len(h5py.File(h5_path, \"r\")['household_weight/2024'])\n", + " # Pull length of .h5 file\n", + " h5_name = f\"{fraction}_enhanced_cps_2024_{approach}_minimised.h5\"\n", + " h5_path = STORAGE_FOLDER / strategy / h5_name\n", + " dataset_size = len(h5py.File(h5_path, \"r\")['household_weight/2024'])\n", "\n", - " # Pull sum of loss column\n", - " cal_log_name = f\"{fraction}_enhanced_cps_2024_random_sampling_minimization_minimised_calibration_log.csv\"\n", - " cal_log_path = STORAGE_FOLDER / strategy / cal_log_name\n", - " total_loss = pd.read_csv(cal_log_path)['loss'].sum()\n", + " # Pull sum of loss column\n", + " cal_log_name = f\"{fraction}_enhanced_cps_2024_{approach}_minimised_calibration_log.csv\"\n", + " cal_log_path = STORAGE_FOLDER / strategy / cal_log_name\n", + " cal_log_path = get_output_path(strategy, cal_log_name)\n", + " calibration_log = pd.read_csv(cal_log_path)\n", + " loss_value = loss_score(calibration_log)\n", "\n", - " add_result(df, strategy, parameter, dataset_size, total_loss)\n", + " reg_results_df = add_result(reg_results_df, strategy, parameter, dataset_size, loss_value)\n", "\n", - "'''\n", "reg_results_df\n" ] }, @@ -869,7 +788,7 @@ }, { "cell_type": "code", - "execution_count": 3, + 
"execution_count": null, "id": "2dc0891c", "metadata": {}, "outputs": [ @@ -2030,6 +1949,10 @@ } ], "source": [ + "\"\"\"\n", + "Creating a multi-line plot with plotly\n", + "\"\"\"\n", + "\n", "# Filter out the baseline row\n", "df_plot = reg_results_df[reg_results_df['strategy'] != 'none'].copy()\n", "df_plot['parameter'] = df_plot['parameter'].astype(str)\n", From 4d8f60c29dbbab6ba087259aad08f8a724896abf Mon Sep 17 00:00:00 2001 From: eccuraa Date: Tue, 15 Jul 2025 15:56:21 -0400 Subject: [PATCH 54/58] forgot a file --- policyengine_us_data/datasets/cps/enhanced_cps.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 3da4f571..59abeafa 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -39,7 +39,7 @@ def reweight( loss_matrix, targets_array, dropout_rate=0.05, - epochs=500, + epochs=250, log_path="calibration_log.csv", penalty_approach=None, penalty_weight=None, @@ -270,7 +270,7 @@ def generate(self): loss_matrix_clean, targets_array_clean, log_path="calibration_log.csv", - epochs=150, + epochs=250, ) data["household_weight"][year] = optimised_weights From 112658f781b2fbf39d9cd1936a009478dab9a5a7 Mon Sep 17 00:00:00 2001 From: eccuraa Date: Tue, 15 Jul 2025 16:00:24 -0400 Subject: [PATCH 55/58] added some headers, just need to add pruning --- test_minimization_approach.ipynb | 45 ++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 6c3921f0..2e0ff269 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -1,5 +1,21 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "037988b0", + "metadata": {}, + "source": [ + "# Testing Arena for Different Regularization Strategies" + ] + }, + { + "cell_type": "markdown", + "id": "268ab898", + "metadata": {}, + "source": [ + "#### Imports" + ] + }, { "cell_type": "code", "execution_count": 4, @@ -52,7 +68,7 @@ "id": "e99994d3", "metadata": {}, "source": [ - "# Enhanced_CPS_2024.py Approaches" + "## Enhanced_CPS_2024.py Regularization Approaches" ] }, { @@ -212,7 +228,7 @@ "id": "69ff392d", "metadata": {}, "source": [ - "# Minimize.py approaches" + "## Minimize.py Regularization Approaches" ] }, { @@ -225,7 +241,17 @@ "name": "stderr", "output_type": "stream", "text": [ - " 91%|█████████ | 228/250 [01:44<00:21, 1.04it/s, loss=3.27e-5, loss_rel_change=-0.384]" + "100%|██████████| 250/250 [01:59<00:00, 2.08it/s, loss=3.47e-5, loss_rel_change=-0.347]\n", + "100%|██████████| 250/250 [01:43<00:00, 2.41it/s, loss=3.27e-5, loss_rel_change=-0.407]\n", + "100%|██████████| 250/250 [02:00<00:00, 2.08it/s, loss=3.22e-5, loss_rel_change=-0.368]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Weight relative change: 99.95%\n", + "Re-calibrating final selected households...\n" ] } ], @@ -278,12 +304,21 @@ " )" ] }, + { + "cell_type": "markdown", + "id": "8568b5ca", + "metadata": {}, + "source": [ + "## Visualization of Results\n", + "Calibration logs can also be shown in María's Vercel dashboard" + ] + }, { "cell_type": "markdown", "id": "f8b0fe2e", "metadata": {}, "source": [ - "### Visualization" + "### Data Scrape for Plotting" ] }, { @@ -693,7 +728,7 @@ "id": "5b203ccd", "metadata": {}, "source": [ - "## Plotting" + "### Plotting" ] }, { From fa9aa02486ebb54b0466fbbe553671c494eacd4d Mon 
Sep 17 00:00:00 2001 From: eccuraa Date: Wed, 16 Jul 2025 10:29:47 -0400 Subject: [PATCH 56/58] fixed a scraping bug & deleted synthetic data --- test_minimization_approach.ipynb | 456 ++++++++++++------------------- 1 file changed, 168 insertions(+), 288 deletions(-) diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 2e0ff269..e9f8eb69 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -18,15 +18,27 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 12, "id": "d6dc9cca", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "ImportError", + "evalue": "cannot import name 'prune_dataset' from 'policyengine_us_data.datasets.cps.enhanced_cps' (/Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/datasets/cps/enhanced_cps.py)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[12], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpolicyengine_us_data\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mstorage\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m STORAGE_FOLDER\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpolicyengine_us\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Microsimulation\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpolicyengine_us_data\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatasets\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcps\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01menhanced_cps\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m reweight, prune_dataset, ExtendedCPS_2024\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpolicyengine_us_data\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m build_loss_matrix\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mnumpy\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mnp\u001b[39;00m\n", + "\u001b[0;31mImportError\u001b[0m: cannot import name 'prune_dataset' from 'policyengine_us_data.datasets.cps.enhanced_cps' (/Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/datasets/cps/enhanced_cps.py)" + ] + } + ], "source": [ "from policyengine_us_data.utils.minimize import minimize_dataset, random_sampling_minimization, candidate_loss_contribution\n", "from policyengine_us_data.storage import STORAGE_FOLDER\n", "from policyengine_us import Microsimulation\n", - "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", + "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, prune_dataset, ExtendedCPS_2024\n", "from policyengine_us_data.utils import build_loss_matrix\n", "import numpy as np\n", "import os\n", @@ -214,7 +226,10 @@ " penalty_weight=penalty_weight, \n", " epochs=250, # Reduced epochs for faster processing\n", " 
)\n", - " data[\"household_weight\"][year] = optimised_weights\n", + " keep_indices = prune_dataset(optimised_weights, epsilon=1e-3, method=\"threshold\")\n", + " pruned_weights = optimised_weights[keep_indices]\n", + " \n", + " data[\"household_weight\"][year] = pruned_weights\n", "\n", " # Save to HDF5 file\n", " with h5py.File(h5_path, \"w\") as f:\n", @@ -233,7 +248,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "aeab67b3", "metadata": {}, "outputs": [ @@ -253,6 +268,117 @@ "Weight relative change: 99.95%\n", "Re-calibrating final selected households...\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250/250 [01:32<00:00, 2.70it/s, loss=3.35e-5, loss_rel_change=-0.359]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Final calibration completed successfully\n", + "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/random_sampling_minimization/0.5_enhanced_cps_2024_minimised.h5\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250/250 [01:45<00:00, 2.38it/s, loss=3.46e-5, loss_rel_change=-0.318]\n", + "100%|██████████| 250/250 [01:42<00:00, 2.44it/s, loss=3.11e-5, loss_rel_change=-0.395]\n", + "100%|██████████| 250/250 [01:46<00:00, 2.35it/s, loss=3.08e-5, loss_rel_change=-0.405]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Weight relative change: 99.99%\n", + "Re-calibrating final selected households...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250/250 [02:18<00:00, 1.80it/s, loss=3.14e-5, loss_rel_change=-0.385]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Final calibration completed successfully\n", + "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/random_sampling_minimization/0.6_enhanced_cps_2024_minimised.h5\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250/250 [02:36<00:00, 1.60it/s, loss=3.29e-5, loss_rel_change=-0.343]\n", + "100%|██████████| 250/250 [3:02:18<00:00, 43.76s/it, loss=3.43e-5, loss_rel_change=-0.578] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Iteration 1: Loss changed from 3.668773852244141e-08 to 3.9001762470775345e-08, which is too high (6.31%). Stopping.\n", + "Weight relative change: 100.00%\n", + "Re-calibrating final selected households...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250/250 [02:07<00:00, 1.95it/s, loss=3.23e-5, loss_rel_change=-0.364]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Final calibration completed successfully\n", + "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/0.001_enhanced_cps_2024_minimised.h5\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250/250 [02:10<00:00, 1.92it/s, loss=3.19e-5, loss_rel_change=-0.372]\n", + "100%|██████████| 250/250 [02:07<00:00, 1.96it/s, loss=3.58e-5, loss_rel_change=-0.556]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Iteration 1: Loss changed from 3.708600229852418e-08 to 3.936675423208132e-08, which is too high (6.15%). 
Stopping.\n", + "Weight relative change: 100.00%\n", + "Re-calibrating final selected households...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250/250 [02:20<00:00, 1.78it/s, loss=3.22e-5, loss_rel_change=-0.38] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Final calibration completed successfully\n", + "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/0.0001_enhanced_cps_2024_minimised.h5\n" + ] } ], "source": [ @@ -323,7 +449,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 13, "id": "225debd8", "metadata": {}, "outputs": [ @@ -371,7 +497,7 @@ "0 none none 41310 0.0069" ] }, - "execution_count": 62, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -382,6 +508,12 @@ "Creating dataframe to store regularization results\n", "\"\"\"\n", "\n", + "\n", + "def get_output_path(approach, file_name):\n", + " output_path = STORAGE_FOLDER / approach / file_name\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + " return output_path\n", + "\n", "# Calculate quality categories\n", "def loss_score(calibration_log):\n", " excellent_count = (\n", @@ -420,7 +552,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "7bb3ef3c", "metadata": {}, "outputs": [ @@ -457,210 +589,50 @@ " none\n", " none\n", " 41310\n", - " 0.0069\n", + " 0.006900\n", " \n", " \n", " 1\n", - " l0_sigmoid\n", - " 1.0\n", - " 41310\n", - " 0.0069\n", + " random_sampling_minimization\n", + " 0.5\n", + " 20655\n", + " 80.882353\n", " \n", " \n", " 2\n", - " l0_sigmoid\n", - " 0.1\n", - " 41310\n", - " 39.2959\n", + " random_sampling_minimization\n", + " 0.6\n", + " 24786\n", + " 80.882353\n", " \n", " \n", " 3\n", - " l0_sigmoid\n", - " 0.01\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 4\n", - " l0_sigmoid\n", - " 0.001\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 5\n", - " l0_sigmoid\n", - " 0.0001\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 6\n", - " l0_sigmoid\n", - " 0.00001\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 7\n", - " l0_log\n", - " 1.0\n", - " 41310\n", - " 0.0069\n", - " \n", - " \n", - " 8\n", - " l0_log\n", - " 0.1\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 9\n", - " l0_log\n", - " 0.01\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 10\n", - " l0_log\n", - " 0.001\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 11\n", - " l0_log\n", - " 0.0001\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 12\n", - " l0_log\n", - " 0.00001\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 13\n", - " l0_exp\n", - " 1.0\n", - " 41310\n", - " 0.0069\n", - " \n", - " \n", - " 14\n", - " l0_exp\n", - " 0.1\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 15\n", - " l0_exp\n", - " 0.01\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 16\n", - " l0_exp\n", - " 0.001\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 17\n", - " l0_exp\n", - " 0.0001\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 18\n", - " l0_exp\n", - " 0.00001\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 19\n", - " l1\n", - " 1.0\n", - " 41310\n", - " 0.0069\n", - " \n", - " \n", - " 20\n", - " l1\n", - " 0.1\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 21\n", - " l1\n", - " 0.01\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 22\n", - " l1\n", 
+ " candidate_loss_contribution\n", " 0.001\n", " 41310\n", - " 39.2959\n", + " 80.882353\n", " \n", " \n", - " 23\n", - " l1\n", + " 4\n", + " candidate_loss_contribution\n", " 0.0001\n", " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 24\n", - " l1\n", - " 0.00001\n", - " 41310\n", - " 39.2959\n", + " 80.882353\n", " \n", " \n", "\n", "" ], "text/plain": [ - " strategy parameter dataset_size total_loss\n", - "0 none none 41310 0.0069\n", - "1 l0_sigmoid 1.0 41310 0.0069\n", - "2 l0_sigmoid 0.1 41310 39.2959\n", - "3 l0_sigmoid 0.01 41310 39.2959\n", - "4 l0_sigmoid 0.001 41310 39.2959\n", - "5 l0_sigmoid 0.0001 41310 39.2959\n", - "6 l0_sigmoid 0.00001 41310 39.2959\n", - "7 l0_log 1.0 41310 0.0069\n", - "8 l0_log 0.1 41310 39.2959\n", - "9 l0_log 0.01 41310 39.2959\n", - "10 l0_log 0.001 41310 39.2959\n", - "11 l0_log 0.0001 41310 39.2959\n", - "12 l0_log 0.00001 41310 39.2959\n", - "13 l0_exp 1.0 41310 0.0069\n", - "14 l0_exp 0.1 41310 39.2959\n", - "15 l0_exp 0.01 41310 39.2959\n", - "16 l0_exp 0.001 41310 39.2959\n", - "17 l0_exp 0.0001 41310 39.2959\n", - "18 l0_exp 0.00001 41310 39.2959\n", - "19 l1 1.0 41310 0.0069\n", - "20 l1 0.1 41310 39.2959\n", - "21 l1 0.01 41310 39.2959\n", - "22 l1 0.001 41310 39.2959\n", - "23 l1 0.0001 41310 39.2959\n", - "24 l1 0.00001 41310 39.2959" + " strategy parameter dataset_size total_loss\n", + "0 none none 41310 0.006900\n", + "1 random_sampling_minimization 0.5 20655 80.882353\n", + "2 random_sampling_minimization 0.6 24786 80.882353\n", + "3 candidate_loss_contribution 0.001 41310 80.882353\n", + "4 candidate_loss_contribution 0.0001 41310 80.882353" ] }, - "execution_count": 63, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -669,7 +641,7 @@ "\"\"\"\n", "Scraping values from created calibration_log.csv and .h5 files to populate the plotting dataframe\n", "\"\"\"\n", - "\n", + "'''\n", "approaches = [\"l0_sigmoid\", \"l0_log\", \"l0_exp\", \"l1\"]\n", "penalty_weights = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]\n", "og_size = 41310 # Original size of the dataset\n", @@ -693,12 +665,11 @@ " loss_value = loss_score(calibration_log)\n", " \n", " reg_results_df = add_result(reg_results_df, strategy, parameter, dataset_size, loss_value)\n", - "\n", - "\n", + "'''\n", "\n", "approaches = {\n", - " \"random_sampling_minimization\":[0.5, 0.6, 0.7, 0.8, 0.9], \n", - " \"candidate_loss_contribution\": [0.001, 0.0001, 0.00001, 0.000001, 0.0000001],\n", + " \"random_sampling_minimization\":[0.5, 0.6], #, 0.7, 0.8, 0.9], \n", + " \"candidate_loss_contribution\": [0.001, 0.0001] #, 0.00001, 0.000001, 0.0000001],\n", "}\n", "\n", "for approach, fractions in approaches.items(): # Use .items() to get key-value pairs\n", @@ -707,14 +678,13 @@ " parameter = fraction\n", "\n", " # Pull length of .h5 file\n", - " h5_name = f\"{fraction}_enhanced_cps_2024_{approach}_minimised.h5\"\n", + " h5_name = f\"{fraction}_enhanced_cps_2024_minimised.h5\"\n", " h5_path = STORAGE_FOLDER / strategy / h5_name\n", " dataset_size = len(h5py.File(h5_path, \"r\")['household_weight/2024'])\n", "\n", " # Pull sum of loss column\n", - " cal_log_name = f\"{fraction}_enhanced_cps_2024_{approach}_minimised_calibration_log.csv\"\n", - " cal_log_path = STORAGE_FOLDER / strategy / cal_log_name\n", - " cal_log_path = get_output_path(strategy, cal_log_name)\n", + " cal_log_name = f\"{fraction}_enhanced_cps_2024_minimised_calibration_log.csv\"\n", + " cal_log_name = get_output_path(strategy, cal_log_name)\n", " calibration_log = pd.read_csv(cal_log_path)\n", " loss_value 
= loss_score(calibration_log)\n", "\n", @@ -731,96 +701,6 @@ "### Plotting" ] }, - { - "cell_type": "code", - "execution_count": 2, - "id": "9602953a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " strategy parameter dataset_size total_loss\n", - "0 none none 41310 0.0069\n", - "1 l0_sigmoid 1.0 41310 0.0069\n", - "2 l0_sigmoid 0.1 37048 5.8041\n", - "3 l0_sigmoid 0.01 32786 13.3255\n", - "4 l0_sigmoid 0.001 28524 21.6723\n", - "5 l0_sigmoid 0.0001 24262 30.6049\n", - "6 l0_sigmoid 0.00001 20000 40.0000\n", - "7 l0_log 1.0 41310 0.0069\n", - "8 l0_log 0.1 37048 8.7028\n", - "9 l0_log 0.01 32786 19.9847\n", - "10 l0_log 0.001 28524 32.5050\n", - "11 l0_log 0.0001 24262 45.9039\n", - "12 l0_log 0.00001 20000 59.9965\n", - "13 l0_exp 1.0 41310 0.0069\n", - "14 l0_exp 0.1 37048 11.6014\n", - "15 l0_exp 0.01 32786 26.6440\n", - "16 l0_exp 0.001 28524 43.3377\n", - "17 l0_exp 0.0001 24262 61.2029\n", - "18 l0_exp 0.00001 20000 79.9931\n", - "19 l1 1.0 41310 0.0069\n", - "20 l1 0.1 37048 14.5000\n", - "21 l1 0.01 32786 33.3033\n", - "22 l1 0.001 28524 54.1704\n", - "23 l1 0.0001 24262 76.5019\n", - "24 l1 0.00001 20000 99.9896\n" - ] - } - ], - "source": [ - "'''\n", - "Synthetic dataset\n", - "'''\n", - "\n", - "# Define values\n", - "strategies = ['l0_sigmoid', 'l0_log', 'l0_exp', 'l1']\n", - "parameters = [1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001]\n", - "\n", - "# Synthetic values\n", - "base_size = 41310\n", - "min_size = 20000\n", - "base_loss = 0.0069\n", - "max_loss = 40.0\n", - "\n", - "strategy_slopes = {\n", - " 'l0_sigmoid': 1.0,\n", - " 'l0_log': 1.5,\n", - " 'l0_exp': 2.0,\n", - " 'l1': 2.5,\n", - "}\n", - "\n", - "rows = [{'strategy': 'none', 'parameter': 'none', 'dataset_size': base_size, 'total_loss': base_loss}]\n", - "\n", - "for strategy in strategies:\n", - " slope = strategy_slopes[strategy]\n", - " \n", - " for i, param in enumerate(parameters):\n", - " # Normalized compression level: 0 (no compression) to 1 (max compression)\n", - " compression_level = i / (len(parameters) - 1)\n", - " \n", - " # Size shrinks linearly\n", - " size = int(base_size - (base_size - min_size) * compression_level)\n", - " \n", - " # Loss increases quadratically (or linearly) based on strategy slope\n", - " loss = round(base_loss + slope * (max_loss - base_loss) * (compression_level ** 1.2), 4)\n", - " \n", - " rows.append({\n", - " 'strategy': strategy,\n", - " 'parameter': param,\n", - " 'dataset_size': size,\n", - " 'total_loss': loss\n", - " })\n", - "\n", - "# Create DataFrame\n", - "reg_results_df = pd.DataFrame(rows)\n", - "\n", - "# Display\n", - "print(reg_results_df)" - ] - }, { "cell_type": "code", "execution_count": null, From 791f0d964a49ef8a52545c82641f8c4ccccbca71 Mon Sep 17 00:00:00 2001 From: eccuraa Date: Wed, 16 Jul 2025 10:50:05 -0400 Subject: [PATCH 57/58] fixed a scraping bug --- test_minimization_approach.ipynb | 277 ++++++------------------------- 1 file changed, 52 insertions(+), 225 deletions(-) diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index e9f8eb69..4fcb8b91 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -18,27 +18,15 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 16, "id": "d6dc9cca", "metadata": {}, - "outputs": [ - { - "ename": "ImportError", - "evalue": "cannot import name 'prune_dataset' from 'policyengine_us_data.datasets.cps.enhanced_cps' 
(/Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/datasets/cps/enhanced_cps.py)", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[12], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpolicyengine_us_data\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mstorage\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m STORAGE_FOLDER\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpolicyengine_us\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Microsimulation\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpolicyengine_us_data\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatasets\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcps\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01menhanced_cps\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m reweight, prune_dataset, ExtendedCPS_2024\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpolicyengine_us_data\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m build_loss_matrix\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mnumpy\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mnp\u001b[39;00m\n", - "\u001b[0;31mImportError\u001b[0m: cannot import name 'prune_dataset' from 'policyengine_us_data.datasets.cps.enhanced_cps' (/Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/datasets/cps/enhanced_cps.py)" - ] - } - ], + "outputs": [], "source": [ "from policyengine_us_data.utils.minimize import minimize_dataset, random_sampling_minimization, candidate_loss_contribution\n", "from policyengine_us_data.storage import STORAGE_FOLDER\n", "from policyengine_us import Microsimulation\n", - "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, prune_dataset, ExtendedCPS_2024\n", + "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", "from policyengine_us_data.utils import build_loss_matrix\n", "import numpy as np\n", "import os\n", @@ -248,136 +236,44 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 17, "id": "aeab67b3", "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250/250 [01:59<00:00, 2.08it/s, loss=3.47e-5, loss_rel_change=-0.347]\n", - "100%|██████████| 250/250 [01:43<00:00, 2.41it/s, loss=3.27e-5, loss_rel_change=-0.407]\n", - "100%|██████████| 250/250 [02:00<00:00, 2.08it/s, loss=3.22e-5, loss_rel_change=-0.368]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Weight relative change: 99.95%\n", - "Re-calibrating final selected households...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250/250 [01:32<00:00, 2.70it/s, loss=3.35e-5, loss_rel_change=-0.359]\n" - ] - }, - { - "name": "stdout", - 
"output_type": "stream", - "text": [ - "Final calibration completed successfully\n", - "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/random_sampling_minimization/0.5_enhanced_cps_2024_minimised.h5\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250/250 [01:45<00:00, 2.38it/s, loss=3.46e-5, loss_rel_change=-0.318]\n", - "100%|██████████| 250/250 [01:42<00:00, 2.44it/s, loss=3.11e-5, loss_rel_change=-0.395]\n", - "100%|██████████| 250/250 [01:46<00:00, 2.35it/s, loss=3.08e-5, loss_rel_change=-0.405]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Weight relative change: 99.99%\n", - "Re-calibrating final selected households...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250/250 [02:18<00:00, 1.80it/s, loss=3.14e-5, loss_rel_change=-0.385]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Final calibration completed successfully\n", - "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/random_sampling_minimization/0.6_enhanced_cps_2024_minimised.h5\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250/250 [02:36<00:00, 1.60it/s, loss=3.29e-5, loss_rel_change=-0.343]\n", - "100%|██████████| 250/250 [3:02:18<00:00, 43.76s/it, loss=3.43e-5, loss_rel_change=-0.578] \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Iteration 1: Loss changed from 3.668773852244141e-08 to 3.9001762470775345e-08, which is too high (6.31%). Stopping.\n", - "Weight relative change: 100.00%\n", - "Re-calibrating final selected households...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250/250 [02:07<00:00, 1.95it/s, loss=3.23e-5, loss_rel_change=-0.364]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Final calibration completed successfully\n", - "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/0.001_enhanced_cps_2024_minimised.h5\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250/250 [02:10<00:00, 1.92it/s, loss=3.19e-5, loss_rel_change=-0.372]\n", - "100%|██████████| 250/250 [02:07<00:00, 1.96it/s, loss=3.58e-5, loss_rel_change=-0.556]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Iteration 1: Loss changed from 3.708600229852418e-08 to 3.936675423208132e-08, which is too high (6.15%). 
Stopping.\n", - "Weight relative change: 100.00%\n", - "Re-calibrating final selected households...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250/250 [02:20<00:00, 1.78it/s, loss=3.22e-5, loss_rel_change=-0.38] \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Final calibration completed successfully\n", - "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/0.0001_enhanced_cps_2024_minimised.h5\n" + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[17], line 31\u001b[0m\n\u001b[1;32m 29\u001b[0m output_path \u001b[38;5;241m=\u001b[39m STORAGE_FOLDER \u001b[38;5;241m/\u001b[39m approach \u001b[38;5;241m/\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvalue\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_enhanced_cps_2024_minimised.h5\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 30\u001b[0m output_path\u001b[38;5;241m.\u001b[39mparent\u001b[38;5;241m.\u001b[39mmkdir(parents\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m---> 31\u001b[0m \u001b[43mminimize_dataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 32\u001b[0m \u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 33\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 34\u001b[0m \u001b[43m \u001b[49m\u001b[43mminimization_function\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mminimization_function\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 35\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget_fractions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 36\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m params \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mloss_rel_change_max\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 38\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m files:\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/utils/minimize.py:336\u001b[0m, in \u001b[0;36mminimize_dataset\u001b[0;34m(dataset, output_path, minimization_function, loss_matrix, targets, **kwargs)\u001b[0m\n\u001b[1;32m 333\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 334\u001b[0m dataset_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(dataset)\n\u001b[0;32m--> 336\u001b[0m \u001b[43mcreate_calibration_log_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 338\u001b[0m dataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_file(dataset_path)\n\u001b[1;32m 339\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m loss_matrix \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m targets \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", + "File 
\u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/utils/minimize.py:26\u001b[0m, in \u001b[0;36mcreate_calibration_log_file\u001b[0;34m(file_path, epoch)\u001b[0m\n\u001b[1;32m 23\u001b[0m dataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_file(file_path)\n\u001b[1;32m 24\u001b[0m sim \u001b[38;5;241m=\u001b[39m Microsimulation(dataset\u001b[38;5;241m=\u001b[39mdataset)\n\u001b[0;32m---> 26\u001b[0m loss_matrix, targets \u001b[38;5;241m=\u001b[39m \u001b[43mbuild_loss_matrix\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m2024\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 28\u001b[0m bad_mask \u001b[38;5;241m=\u001b[39m loss_matrix\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39misin(bad_targets)\n\u001b[1;32m 29\u001b[0m keep_mask_bool \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m~\u001b[39mbad_mask\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/utils/loss.py:243\u001b[0m, in \u001b[0;36mbuild_loss_matrix\u001b[0;34m(dataset, time_period)\u001b[0m\n\u001b[1;32m 241\u001b[0m \u001b[38;5;66;03m# National ACA Spending\u001b[39;00m\n\u001b[1;32m 242\u001b[0m label \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnation/gov/aca_spending\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m--> 243\u001b[0m loss_matrix[label] \u001b[38;5;241m=\u001b[39m \u001b[43msim\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 244\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43maca_ptc\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mhousehold\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m2025\u001b[39;49m\n\u001b[1;32m 245\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mvalues\n\u001b[1;32m 246\u001b[0m ACA_SPENDING_2024 \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m9.8e10\u001b[39m \u001b[38;5;66;03m# 2024 outlays on PTC\u001b[39;00m\n\u001b[1;32m 247\u001b[0m targets_array\u001b[38;5;241m.\u001b[39mappend(ACA_SPENDING_2024)\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/microsimulation.py:54\u001b[0m, in \u001b[0;36mMicrosimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, use_weights, decode_enums)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m period \u001b[38;5;241m=\u001b[39m get_period(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period)\n\u001b[0;32m---> 54\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_enums\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m use_weights:\n\u001b[1;32m 56\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m values\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:485\u001b[0m, in \u001b[0;36mSimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, decode_enums)\u001b[0m\n\u001b[1;32m 482\u001b[0m np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mseed(\u001b[38;5;28mhash\u001b[39m(variable_name \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(period)) \u001b[38;5;241m%\u001b[39m \u001b[38;5;241m1000000\u001b[39m)\n\u001b[1;32m 484\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 485\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_calculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 486\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, EnumArray) \u001b[38;5;129;01mand\u001b[39;00m decode_enums:\n\u001b[1;32m 487\u001b[0m result \u001b[38;5;241m=\u001b[39m result\u001b[38;5;241m.\u001b[39mdecode_to_str()\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:715\u001b[0m, in \u001b[0;36mSimulation._calculate\u001b[0;34m(self, variable_name, period)\u001b[0m\n\u001b[1;32m 713\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 714\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_for_cycle(variable\u001b[38;5;241m.\u001b[39mname, period)\n\u001b[0;32m--> 715\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_formula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 717\u001b[0m \u001b[38;5;66;03m# If no result, use the default value and cache it\u001b[39;00m\n\u001b[1;32m 718\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m array \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 719\u001b[0m \u001b[38;5;66;03m# Check if the variable has a previously defined value\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:1005\u001b[0m, in \u001b[0;36mSimulation._run_formula\u001b[0;34m(self, variable, population, period)\u001b[0m\n\u001b[1;32m 1003\u001b[0m array \u001b[38;5;241m=\u001b[39m formula(population, period)\n\u001b[1;32m 1004\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1005\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[43mformula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparameters_at\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1007\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m array\n", 
+ "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/variables/gov/aca/ptc/aca_ptc.py:14\u001b[0m, in \u001b[0;36maca_ptc.formula\u001b[0;34m(tax_unit, period, parameters)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mformula\u001b[39m(tax_unit, period, parameters):\n\u001b[0;32m---> 14\u001b[0m plan_cost \u001b[38;5;241m=\u001b[39m \u001b[43mtax_unit\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mslcsp\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 15\u001b[0m income \u001b[38;5;241m=\u001b[39m tax_unit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maca_magi\u001b[39m\u001b[38;5;124m\"\u001b[39m, period)\n\u001b[1;32m 16\u001b[0m applicable_figure \u001b[38;5;241m=\u001b[39m tax_unit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maca_ptc_phase_out_rate\u001b[39m\u001b[38;5;124m\"\u001b[39m, period)\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/populations/group_population.py:38\u001b[0m, in \u001b[0;36mGroupPopulation.__call__\u001b[0;34m(self, variable_name, period, options)\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msum(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmembers(variable_name, period, options))\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 38\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__call__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/populations/population.py:137\u001b[0m, in \u001b[0;36mPopulation.__call__\u001b[0;34m(self, variable_name, period, options)\u001b[0m\n\u001b[1;32m 133\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msimulation\u001b[38;5;241m.\u001b[39mcalculate_divide(\n\u001b[1;32m 134\u001b[0m variable_name, period, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mcalculate_kwargs\n\u001b[1;32m 135\u001b[0m )\n\u001b[1;32m 136\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 137\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msimulation\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 138\u001b[0m \u001b[43m \u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcalculate_kwargs\u001b[49m\n\u001b[1;32m 139\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/microsimulation.py:54\u001b[0m, in \u001b[0;36mMicrosimulation.calculate\u001b[0;34m(self, variable_name, 
period, map_to, use_weights, decode_enums)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m period \u001b[38;5;241m=\u001b[39m get_period(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period)\n\u001b[0;32m---> 54\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_enums\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m use_weights:\n\u001b[1;32m 56\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m values\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:485\u001b[0m, in \u001b[0;36mSimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, decode_enums)\u001b[0m\n\u001b[1;32m 482\u001b[0m np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mseed(\u001b[38;5;28mhash\u001b[39m(variable_name \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(period)) \u001b[38;5;241m%\u001b[39m \u001b[38;5;241m1000000\u001b[39m)\n\u001b[1;32m 484\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 485\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_calculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 486\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, EnumArray) \u001b[38;5;129;01mand\u001b[39;00m decode_enums:\n\u001b[1;32m 487\u001b[0m result \u001b[38;5;241m=\u001b[39m result\u001b[38;5;241m.\u001b[39mdecode_to_str()\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:681\u001b[0m, in \u001b[0;36mSimulation._calculate\u001b[0;34m(self, variable_name, period)\u001b[0m\n\u001b[1;32m 679\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_calculate(variable_name, contained_months[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m])\n\u001b[1;32m 680\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 681\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate_add\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 682\u001b[0m alternate_period_handling \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 683\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m variable\u001b[38;5;241m.\u001b[39mdefinition_period \u001b[38;5;241m==\u001b[39m YEAR 
\u001b[38;5;129;01mand\u001b[39;00m period\u001b[38;5;241m.\u001b[39munit \u001b[38;5;241m==\u001b[39m MONTH:\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/microsimulation.py:67\u001b[0m, in \u001b[0;36mMicrosimulation.calculate_add\u001b[0;34m(self, variable_name, period, map_to, use_weights)\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mcalculate_add\u001b[39m(\n\u001b[1;32m 61\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 62\u001b[0m variable_name: \u001b[38;5;28mstr\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 65\u001b[0m use_weights: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 66\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m MicroSeries:\n\u001b[0;32m---> 67\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate_add\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 68\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m use_weights:\n\u001b[1;32m 69\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m values\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:846\u001b[0m, in \u001b[0;36mSimulation.calculate_add\u001b[0;34m(self, variable_name, period, decode_enums)\u001b[0m\n\u001b[1;32m 835\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m variable\u001b[38;5;241m.\u001b[39mdefinition_period \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m [\n\u001b[1;32m 836\u001b[0m periods\u001b[38;5;241m.\u001b[39mDAY,\n\u001b[1;32m 837\u001b[0m periods\u001b[38;5;241m.\u001b[39mMONTH,\n\u001b[1;32m 838\u001b[0m periods\u001b[38;5;241m.\u001b[39mYEAR,\n\u001b[1;32m 839\u001b[0m ]:\n\u001b[1;32m 840\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 841\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnable to sum constant variable \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m over period \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m: only variables defined daily, monthly, or yearly can be summed over time.\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(\n\u001b[1;32m 842\u001b[0m variable\u001b[38;5;241m.\u001b[39mname, period\n\u001b[1;32m 843\u001b[0m )\n\u001b[1;32m 844\u001b[0m )\n\u001b[0;32m--> 846\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28msum\u001b[39m(\n\u001b[1;32m 847\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcalculate(variable_name, sub_period)\n\u001b[1;32m 848\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m sub_period \u001b[38;5;129;01min\u001b[39;00m period\u001b[38;5;241m.\u001b[39mget_subperiods(variable\u001b[38;5;241m.\u001b[39mdefinition_period)\n\u001b[1;32m 849\u001b[0m )\n\u001b[1;32m 850\u001b[0m holder \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_holder(variable\u001b[38;5;241m.\u001b[39mname)\n\u001b[1;32m 851\u001b[0m 
holder\u001b[38;5;241m.\u001b[39mput_in_cache(result, period, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbranch_name)\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:847\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 835\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m variable\u001b[38;5;241m.\u001b[39mdefinition_period \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m [\n\u001b[1;32m 836\u001b[0m periods\u001b[38;5;241m.\u001b[39mDAY,\n\u001b[1;32m 837\u001b[0m periods\u001b[38;5;241m.\u001b[39mMONTH,\n\u001b[1;32m 838\u001b[0m periods\u001b[38;5;241m.\u001b[39mYEAR,\n\u001b[1;32m 839\u001b[0m ]:\n\u001b[1;32m 840\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 841\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnable to sum constant variable \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m over period \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m: only variables defined daily, monthly, or yearly can be summed over time.\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(\n\u001b[1;32m 842\u001b[0m variable\u001b[38;5;241m.\u001b[39mname, period\n\u001b[1;32m 843\u001b[0m )\n\u001b[1;32m 844\u001b[0m )\n\u001b[1;32m 846\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28msum\u001b[39m(\n\u001b[0;32m--> 847\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msub_period\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 848\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m sub_period \u001b[38;5;129;01min\u001b[39;00m period\u001b[38;5;241m.\u001b[39mget_subperiods(variable\u001b[38;5;241m.\u001b[39mdefinition_period)\n\u001b[1;32m 849\u001b[0m )\n\u001b[1;32m 850\u001b[0m holder \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_holder(variable\u001b[38;5;241m.\u001b[39mname)\n\u001b[1;32m 851\u001b[0m holder\u001b[38;5;241m.\u001b[39mput_in_cache(result, period, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbranch_name)\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/microsimulation.py:54\u001b[0m, in \u001b[0;36mMicrosimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, use_weights, decode_enums)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m period \u001b[38;5;241m=\u001b[39m get_period(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period)\n\u001b[0;32m---> 54\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mmap_to\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_enums\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m use_weights:\n\u001b[1;32m 56\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m values\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:485\u001b[0m, in \u001b[0;36mSimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, decode_enums)\u001b[0m\n\u001b[1;32m 482\u001b[0m np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mseed(\u001b[38;5;28mhash\u001b[39m(variable_name \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(period)) \u001b[38;5;241m%\u001b[39m \u001b[38;5;241m1000000\u001b[39m)\n\u001b[1;32m 484\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 485\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_calculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 486\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, EnumArray) \u001b[38;5;129;01mand\u001b[39;00m decode_enums:\n\u001b[1;32m 487\u001b[0m result \u001b[38;5;241m=\u001b[39m result\u001b[38;5;241m.\u001b[39mdecode_to_str()\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:715\u001b[0m, in \u001b[0;36mSimulation._calculate\u001b[0;34m(self, variable_name, period)\u001b[0m\n\u001b[1;32m 713\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 714\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_for_cycle(variable\u001b[38;5;241m.\u001b[39mname, period)\n\u001b[0;32m--> 715\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_formula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 717\u001b[0m \u001b[38;5;66;03m# If no result, use the default value and cache it\u001b[39;00m\n\u001b[1;32m 718\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m array \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 719\u001b[0m \u001b[38;5;66;03m# Check if the variable has a previously defined value\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:940\u001b[0m, in \u001b[0;36mSimulation._run_formula\u001b[0;34m(self, variable, population, period)\u001b[0m\n\u001b[1;32m 938\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m added_variable \u001b[38;5;129;01min\u001b[39;00m adds_list:\n\u001b[1;32m 939\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m added_variable \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtax_benefit_system\u001b[38;5;241m.\u001b[39mvariables:\n\u001b[0;32m--> 940\u001b[0m values \u001b[38;5;241m=\u001b[39m values \u001b[38;5;241m+\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 941\u001b[0m \u001b[43m 
\u001b[49m\u001b[43madded_variable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvariable\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mentity\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkey\u001b[49m\n\u001b[1;32m 942\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 943\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 944\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/microsimulation.py:54\u001b[0m, in \u001b[0;36mMicrosimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, use_weights, decode_enums)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m period \u001b[38;5;241m=\u001b[39m get_period(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period)\n\u001b[0;32m---> 54\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_enums\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m use_weights:\n\u001b[1;32m 56\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m values\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:485\u001b[0m, in \u001b[0;36mSimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, decode_enums)\u001b[0m\n\u001b[1;32m 482\u001b[0m np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mseed(\u001b[38;5;28mhash\u001b[39m(variable_name \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(period)) \u001b[38;5;241m%\u001b[39m \u001b[38;5;241m1000000\u001b[39m)\n\u001b[1;32m 484\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 485\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_calculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 486\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, EnumArray) \u001b[38;5;129;01mand\u001b[39;00m decode_enums:\n\u001b[1;32m 487\u001b[0m result \u001b[38;5;241m=\u001b[39m result\u001b[38;5;241m.\u001b[39mdecode_to_str()\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:715\u001b[0m, in \u001b[0;36mSimulation._calculate\u001b[0;34m(self, variable_name, period)\u001b[0m\n\u001b[1;32m 713\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 
714\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_for_cycle(variable\u001b[38;5;241m.\u001b[39mname, period)\n\u001b[0;32m--> 715\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_formula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 717\u001b[0m \u001b[38;5;66;03m# If no result, use the default value and cache it\u001b[39;00m\n\u001b[1;32m 718\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m array \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 719\u001b[0m \u001b[38;5;66;03m# Check if the variable has a previously defined value\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:1005\u001b[0m, in \u001b[0;36mSimulation._run_formula\u001b[0;34m(self, variable, population, period)\u001b[0m\n\u001b[1;32m 1003\u001b[0m array \u001b[38;5;241m=\u001b[39m formula(population, period)\n\u001b[1;32m 1004\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1005\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[43mformula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparameters_at\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1007\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m array\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/variables/gov/aca/slspc/slcsp_age_curve_amount_person.py:27\u001b[0m, in \u001b[0;36mslcsp_age_curve_amount_person.formula\u001b[0;34m(person, period, parameters)\u001b[0m\n\u001b[1;32m 19\u001b[0m p \u001b[38;5;241m=\u001b[39m parameters(period)\u001b[38;5;241m.\u001b[39mgov\u001b[38;5;241m.\u001b[39maca\u001b[38;5;241m.\u001b[39mage_curves\n\u001b[1;32m 21\u001b[0m \u001b[38;5;66;03m# Handle other states with regular bracket structures\u001b[39;00m\n\u001b[1;32m 22\u001b[0m multiplier \u001b[38;5;241m=\u001b[39m select(\n\u001b[1;32m 23\u001b[0m [\n\u001b[1;32m 24\u001b[0m state_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAL\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 25\u001b[0m state_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDC\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 26\u001b[0m state_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMA\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m---> 27\u001b[0m \u001b[43mstate_code\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mMN\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m,\n\u001b[1;32m 28\u001b[0m state_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMS\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 29\u001b[0m state_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOR\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 30\u001b[0m state_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUT\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 31\u001b[0m ],\n\u001b[1;32m 32\u001b[0m 
[\n\u001b[1;32m 33\u001b[0m p\u001b[38;5;241m.\u001b[39mal\u001b[38;5;241m.\u001b[39mcalc(age),\n\u001b[1;32m 34\u001b[0m p\u001b[38;5;241m.\u001b[39mdc\u001b[38;5;241m.\u001b[39mcalc(age),\n\u001b[1;32m 35\u001b[0m p\u001b[38;5;241m.\u001b[39mma\u001b[38;5;241m.\u001b[39mcalc(age),\n\u001b[1;32m 36\u001b[0m p\u001b[38;5;241m.\u001b[39mmn\u001b[38;5;241m.\u001b[39mcalc(age),\n\u001b[1;32m 37\u001b[0m p\u001b[38;5;241m.\u001b[39mms\u001b[38;5;241m.\u001b[39mcalc(age),\n\u001b[1;32m 38\u001b[0m p[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mor\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mcalc(age),\n\u001b[1;32m 39\u001b[0m p\u001b[38;5;241m.\u001b[39mut\u001b[38;5;241m.\u001b[39mcalc(age),\n\u001b[1;32m 40\u001b[0m ],\n\u001b[1;32m 41\u001b[0m default\u001b[38;5;241m=\u001b[39mp\u001b[38;5;241m.\u001b[39mdefault\u001b[38;5;241m.\u001b[39mcalc(age),\n\u001b[1;32m 42\u001b[0m )\n\u001b[1;32m 43\u001b[0m age_curve_applies \u001b[38;5;241m=\u001b[39m person\u001b[38;5;241m.\u001b[39mtax_unit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mslcsp_age_curve_applies\u001b[39m\u001b[38;5;124m\"\u001b[39m, period)\n\u001b[1;32m 44\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m base_cost \u001b[38;5;241m*\u001b[39m multiplier \u001b[38;5;241m*\u001b[39m age_curve_applies\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], @@ -395,10 +291,10 @@ "\n", "optional_params = {\n", " \"random_sampling_minimization\": {\n", - " \"target_fractions\": [0.5, 0.6]#, 0.7, 0.8, 0.9], # fractions of the dataset to keep\n", + " \"target_fractions\": [0.7, 0.8, 0.9]#, 0.5, 0.6]], # fractions of the dataset to keep\n", " },\n", " \"candidate_loss_contribution\": {\n", - " \"loss_rel_change_max\": [0.001, 0.0001]#, 0.00001, 0.000001, 0.0000001] # maximum relative change in loss\n", + " \"loss_rel_change_max\": [0.00001, 0.000001, 0.0000001]#, 0.001, 0.0001]] # maximum relative change in loss\n", " }\n", "}\n", "\n", @@ -449,7 +345,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 25, "id": "225debd8", "metadata": {}, "outputs": [ @@ -497,7 +393,7 @@ "0 none none 41310 0.0069" ] }, - "execution_count": 13, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -552,89 +448,20 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "7bb3ef3c", "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
" - ], - "text/plain": [ - " strategy parameter dataset_size total_loss\n", - "0 none none 41310 0.006900\n", - "1 random_sampling_minimization 0.5 20655 80.882353\n", - "2 random_sampling_minimization 0.6 24786 80.882353\n", - "3 candidate_loss_contribution 0.001 41310 80.882353\n", - "4 candidate_loss_contribution 0.0001 41310 80.882353" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" + "ename": "AttributeError", + "evalue": "module 'h5py' has no attribute 'Files'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[26], line 43\u001b[0m\n\u001b[1;32m 41\u001b[0m h5_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfraction\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_enhanced_cps_2024_minimised.h5\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 42\u001b[0m h5_path \u001b[38;5;241m=\u001b[39m STORAGE_FOLDER \u001b[38;5;241m/\u001b[39m strategy \u001b[38;5;241m/\u001b[39m h5_name\n\u001b[0;32m---> 43\u001b[0m dataset_size \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(\u001b[43mh5py\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mFiles\u001b[49m(h5_path, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m)[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mhousehold_weight/2024\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[1;32m 45\u001b[0m \u001b[38;5;66;03m# Pull sum of loss column\u001b[39;00m\n\u001b[1;32m 46\u001b[0m cal_log_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfraction\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_enhanced_cps_2024_minimised_calibration_log.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "\u001b[0;31mAttributeError\u001b[0m: module 'h5py' has no attribute 'Files'" + ] } ], "source": [ @@ -684,7 +511,7 @@ "\n", " # Pull sum of loss column\n", " cal_log_name = f\"{fraction}_enhanced_cps_2024_minimised_calibration_log.csv\"\n", - " cal_log_name = get_output_path(strategy, cal_log_name)\n", + " cal_log_path = get_output_path(strategy, cal_log_name)\n", " calibration_log = pd.read_csv(cal_log_path)\n", " loss_value = loss_score(calibration_log)\n", "\n", From 9520b16ce9777cfe736db893c0a35c92a19dce7a Mon Sep 17 00:00:00 2001 From: eccuraa Date: Wed, 16 Jul 2025 11:03:48 -0400 Subject: [PATCH 58/58] added pruning to L0, L1 approaches (and discovered candidate_loss approach is not being pruned yet either) --- .../datasets/cps/enhanced_cps.py | 16 +- test_minimization_approach.ipynb | 421 ++++++------------ 2 files changed, 143 insertions(+), 294 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 59abeafa..2fbb0293 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -39,7 +39,7 @@ def reweight( loss_matrix, targets_array, dropout_rate=0.05, - epochs=250, + epochs=150, log_path="calibration_log.csv", penalty_approach=None, penalty_weight=None, @@ -108,11 +108,21 @@ def loss( if penalty_approach == "l1": l1 = torch.mean(weights) return rel_error_normalized.mean() + penalty_weight * l1 - + return rel_error_normalized.mean() + penalty_weight * smoothed_l0 else: return rel_error_normalized.mean() + + 
def prune_dataset(weights, epsilon=1e-3): + """ + Prune dataset samples based on learned weights. + Returns indices of samples to keep. + """ + importance_scores = weights.detach().cpu().numpy() + keep_indices = np.where(importance_scores > epsilon)[0] + + return keep_indices def dropout_weights(weights, p): if p == 0: @@ -270,7 +280,7 @@ def generate(self): loss_matrix_clean, targets_array_clean, log_path="calibration_log.csv", - epochs=250, + epochs= 150, ) data["household_weight"][year] = optimised_weights diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 4fcb8b91..a4bd87be 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 34, "id": "d6dc9cca", "metadata": {}, "outputs": [], @@ -76,100 +76,15 @@ "execution_count": null, "id": "db975ac1", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 10/10 [00:03<00:00, 3.00it/s, loss=9.1e-5, loss_rel_change=-0.809] \n", - "100%|██████████| 10/10 [00:03<00:00, 2.96it/s, loss=0.000181, loss_rel_change=-0.679]\n", - "100%|██████████| 10/10 [00:03<00:00, 2.98it/s, loss=0.00108, loss_rel_change=-0.273]\n", - "100%|██████████| 10/10 [00:02<00:00, 3.59it/s, loss=0.0101, loss_rel_change=-0.0377]\n", - "100%|██████████| 10/10 [00:02<00:00, 3.46it/s, loss=0.1, loss_rel_change=-0.00391]\n", - "100%|██████████| 10/10 [00:02<00:00, 3.52it/s, loss=0.000191, loss_rel_change=-0.672]\n", - "100%|██████████| 10/10 [00:03<00:00, 2.89it/s, loss=0.00116, loss_rel_change=-0.274]\n", - "100%|██████████| 10/10 [00:02<00:00, 3.57it/s, loss=0.00978, loss_rel_change=-0.166]\n", - "100%|██████████| 10/10 [00:02<00:00, 3.34it/s, loss=0.0881, loss_rel_change=-0.22]\n", - "100%|██████████| 10/10 [00:02<00:00, 3.55it/s, loss=0.866, loss_rel_change=-0.23]\n", - "100%|██████████| 10/10 [00:03<00:00, 3.31it/s, loss=9.12e-5, loss_rel_change=-0.812]\n", - "100%|██████████| 10/10 [00:03<00:00, 3.26it/s, loss=0.00018, loss_rel_change=-0.687]\n", - "100%|██████████| 10/10 [00:02<00:00, 3.47it/s, loss=0.00108, loss_rel_change=-0.263]\n", - "100%|██████████| 10/10 [00:03<00:00, 3.21it/s, loss=0.0101, loss_rel_change=-0.0373]\n", - "100%|██████████| 10/10 [00:02<00:00, 3.37it/s, loss=0.1, loss_rel_change=-0.00383]\n", - "100%|██████████| 10/10 [00:03<00:00, 3.28it/s, loss=0.00389, loss_rel_change=-0.875]\n", - "100%|██████████| 10/10 [00:03<00:00, 3.17it/s, loss=0.0328, loss_rel_change=-0.894]\n", - "100%|██████████| 10/10 [00:03<00:00, 2.72it/s, loss=0.321, loss_rel_change=-0.896]\n", - "100%|██████████| 10/10 [00:02<00:00, 3.49it/s, loss=3.21, loss_rel_change=-0.896]\n", - "100%|██████████| 10/10 [00:02<00:00, 3.37it/s, loss=32.1, loss_rel_change=-0.896]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "=== CALIBRATION LOG DEBUG ===\n", - "File path: /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/enhanced_cps_2024.h5\n", - "Epoch: 0\n", - "Number of households: 41310\n", - "Total weight: 12764381616743.21\n", - "Weight range: 0.54 to 1303728.75\n", - "Loss matrix shape: (41310, 2813)\n", - "Number of targets: 2813\n", - "After filtering bad targets:\n", - "Loss matrix clean shape: (41310, 2805)\n", - "Number of clean targets: 2805\n", - "Estimates shape: (2805,)\n", - "Estimates sum: 324584770671300.88\n", - "First 3 estimates: nation/irs/adjusted gross income/total/AGI in -inf-inf/taxable/All 
1.498784e+13\n", - "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/All 1.609638e+10\n", - "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/All 6.707770e+10\n", - "dtype: float64\n", - "First 3 targets: [1.62972204e+13 1.68634879e+10 6.76819729e+10]\n", - "Mean absolute error: 17235490830.73\n", - "Mean relative error: 0.0997\n", - "=== END DEBUG ===\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250/250 [01:38<00:00, 2.54it/s, loss=3.62e-5, loss_rel_change=-0.301]\n", - "100%|██████████| 250/250 [01:35<00:00, 2.62it/s, loss=3.58e-5, loss_rel_change=-0.294]\n", - "100%|██████████| 250/250 [01:33<00:00, 2.68it/s, loss=3.34e-5, loss_rel_change=-0.376]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Weight relative change: 99.95%\n", - "Re-calibrating final selected households...\n" - ] - }, - { - "ename": "AttributeError", - "evalue": "'numpy.ndarray' object has no attribute 'columns'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[3], line 92\u001b[0m\n\u001b[1;32m 90\u001b[0m output_path \u001b[38;5;241m=\u001b[39m STORAGE_FOLDER \u001b[38;5;241m/\u001b[39m approach \u001b[38;5;241m/\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvalue\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_enhanced_cps_2024_minimised.h5\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 91\u001b[0m output_path\u001b[38;5;241m.\u001b[39mparent\u001b[38;5;241m.\u001b[39mmkdir(parents\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m---> 92\u001b[0m \u001b[43mminimise_dataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 93\u001b[0m \u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 94\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 95\u001b[0m \u001b[43m \u001b[49m\u001b[43mminimization_function\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mminimization_function\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 96\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget_fractions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 97\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 98\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m params \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mloss_rel_change_max\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 99\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m files:\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/utils/minimise.py:430\u001b[0m, in \u001b[0;36mminimise_dataset\u001b[0;34m(dataset, output_path, minimization_function, **kwargs)\u001b[0m\n\u001b[1;32m 428\u001b[0m \u001b[38;5;66;03m# Re-calibrate the final selected households to hit targets\u001b[39;00m\n\u001b[1;32m 429\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRe-calibrating final selected households...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 430\u001b[0m calibrated_weights \u001b[38;5;241m=\u001b[39m 
\u001b[43mreweight\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 431\u001b[0m \u001b[43m \u001b[49m\u001b[43minitial_weights\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 432\u001b[0m \u001b[43m \u001b[49m\u001b[43mloss_matrix_clean\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Convert to numpy array\u001b[39;49;00m\n\u001b[1;32m 433\u001b[0m \u001b[43m \u001b[49m\u001b[43mtargets_clean\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 434\u001b[0m \u001b[43m \u001b[49m\u001b[43mepochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m250\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Reduced epochs for faster processing\u001b[39;49;00m\n\u001b[1;32m 435\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 436\u001b[0m sim\u001b[38;5;241m.\u001b[39mset_input(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhousehold_weight\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;241m2024\u001b[39m, calibrated_weights)\n\u001b[1;32m 437\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFinal calibration completed successfully\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/datasets/cps/enhanced_cps.py:47\u001b[0m, in \u001b[0;36mreweight\u001b[0;34m(original_weights, loss_matrix, targets_array, dropout_rate, epochs, log_path, penalty_approach, penalty_weight)\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mreweight\u001b[39m(\n\u001b[1;32m 38\u001b[0m original_weights,\n\u001b[1;32m 39\u001b[0m loss_matrix,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 45\u001b[0m penalty_weight\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 46\u001b[0m ):\n\u001b[0;32m---> 47\u001b[0m target_names \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39marray(\u001b[43mloss_matrix\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m)\n\u001b[1;32m 48\u001b[0m is_national \u001b[38;5;241m=\u001b[39m loss_matrix\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mstr\u001b[38;5;241m.\u001b[39mstartswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnation/\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 49\u001b[0m loss_matrix \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mtensor(loss_matrix\u001b[38;5;241m.\u001b[39mvalues, dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat32)\n", - "\u001b[0;31mAttributeError\u001b[0m: 'numpy.ndarray' object has no attribute 'columns'" - ] - } - ], + "outputs": [], "source": [ "## ALL TESTS\n", "\n", "## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation\n", "input_dataset = ExtendedCPS_2024\n", "\n", - "approaches = [\"l0_sigmoid\", \"l0_log\", \"l0_exp\", \"l1\"]\n", - "penalty_weights = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]\n", + "approaches = [\"l0_sigmoid\"]#, \"l0_log\", \"l0_exp\", \"l1\"]\n", + "penalty_weights = [1e-5]#, 1e-4, 1e-3, 1e-2, 1e-1]\n", "\n", "def get_output_path(approach, file_name):\n", " output_path = STORAGE_FOLDER / approach / file_name\n", @@ -214,7 +129,7 @@ " penalty_weight=penalty_weight, \n", " epochs=250, # Reduced epochs for faster processing\n", " )\n", - " keep_indices = prune_dataset(optimised_weights, epsilon=1e-3, method=\"threshold\")\n", + " keep_indices = prune_dataset(optimised_weights, epsilon=1e-3)\n", " 
pruned_weights = optimised_weights[keep_indices]\n", " \n", " data[\"household_weight\"][year] = pruned_weights\n", @@ -345,7 +260,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 30, "id": "225debd8", "metadata": {}, "outputs": [ @@ -393,7 +308,7 @@ "0 none none 41310 0.0069" ] }, - "execution_count": 25, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -453,27 +368,113 @@ "metadata": {}, "outputs": [ { - "ename": "AttributeError", - "evalue": "module 'h5py' has no attribute 'Files'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[26], line 43\u001b[0m\n\u001b[1;32m 41\u001b[0m h5_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfraction\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_enhanced_cps_2024_minimised.h5\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 42\u001b[0m h5_path \u001b[38;5;241m=\u001b[39m STORAGE_FOLDER \u001b[38;5;241m/\u001b[39m strategy \u001b[38;5;241m/\u001b[39m h5_name\n\u001b[0;32m---> 43\u001b[0m dataset_size \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(\u001b[43mh5py\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mFiles\u001b[49m(h5_path, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m)[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mhousehold_weight/2024\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[1;32m 45\u001b[0m \u001b[38;5;66;03m# Pull sum of loss column\u001b[39;00m\n\u001b[1;32m 46\u001b[0m cal_log_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfraction\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_enhanced_cps_2024_minimised_calibration_log.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", - "\u001b[0;31mAttributeError\u001b[0m: module 'h5py' has no attribute 'Files'" - ] + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
" + ], + "text/plain": [ + " strategy parameter dataset_size total_loss\n", + "0 none none 41310 0.006900\n", + "1 candidate_loss_contribution 1.0 41310 0.006900\n", + "2 random_sampling_minimization 0.5 20655 80.882353\n", + "3 random_sampling_minimization 0.6 24786 79.117647\n", + "4 random_sampling_minimization 1.0 41310 0.006900\n", + "5 candidate_loss_contribution 0.001 41310 77.647059\n", + "6 candidate_loss_contribution 0.0001 41310 80.196078" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "\"\"\"\n", "Scraping values from created calibration_log.csv and .h5 files to populate the plotting dataframe\n", "\"\"\"\n", - "'''\n", - "approaches = [\"l0_sigmoid\", \"l0_log\", \"l0_exp\", \"l1\"]\n", - "penalty_weights = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]\n", + "\n", "og_size = 41310 # Original size of the dataset\n", "og_loss = 6.9e-3 # Original loss from the baseline dataset\n", "\n", + "approaches = [\"l0_sigmoid\", \"l0_log\", \"l0_exp\", \"l1\"]\n", + "penalty_weights = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]\n", + "\n", "for approach in approaches:\n", " strategy = approach\n", " reg_results_df = add_result(reg_results_df, strategy, 1.0, og_size, og_loss)\n", @@ -492,7 +493,6 @@ " loss_value = loss_score(calibration_log)\n", " \n", " reg_results_df = add_result(reg_results_df, strategy, parameter, dataset_size, loss_value)\n", - "'''\n", "\n", "approaches = {\n", " \"random_sampling_minimization\":[0.5, 0.6], #, 0.7, 0.8, 0.9], \n", @@ -500,6 +500,7 @@ "}\n", "\n", "for approach, fractions in approaches.items(): # Use .items() to get key-value pairs\n", + " reg_results_df = add_result(reg_results_df, strategy, 1.0, og_size, og_loss)\n", " for fraction in fractions:\n", " strategy = approach\n", " parameter = fraction\n", @@ -530,7 +531,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "id": "2dc0891c", "metadata": {}, "outputs": [ @@ -544,26 +545,17 @@ { "customdata": [ [ - "l0_exp" - ], - [ - "l0_exp" - ], - [ - "l0_exp" - ], - [ - "l0_exp" + "candidate_loss_contribution" ], [ - "l0_exp" + "candidate_loss_contribution" ], [ - "l0_exp" + "candidate_loss_contribution" ] ], "hovertemplate": "Strategy: %{customdata[0]}
Size: %{x}<br>Loss: %{y:.4f}<br>
Param: %{text}", - "legendgroup": "l0_exp", + "legendgroup": "candidate_loss_contribution", "line": { "color": "#636efa", "dash": "solid" @@ -572,224 +564,71 @@ "symbol": "circle" }, "mode": "lines+markers+text", - "name": "l0_exp", + "name": "candidate_loss_contribution", "orientation": "v", "showlegend": true, "text": [ "1.0", - "0.1", - "0.01", "0.001", - "0.0001", - "1e-05" + "0.0001" ], "textposition": "top center", "type": "scatter", "x": [ 41310, - 37048, - 32786, - 28524, - 24262, - 20000 - ], - "xaxis": "x", - "y": [ - 0.0069, - 11.6014, - 26.644, - 43.3377, - 61.2029, - 79.9931 - ], - "yaxis": "y" - }, - { - "customdata": [ - [ - "l0_log" - ], - [ - "l0_log" - ], - [ - "l0_log" - ], - [ - "l0_log" - ], - [ - "l0_log" - ], - [ - "l0_log" - ] - ], - "hovertemplate": "Strategy: %{customdata[0]}
Size: %{x}<br>Loss: %{y:.4f}<br>
Param: %{text}", - "legendgroup": "l0_log", - "line": { - "color": "#EF553B", - "dash": "solid" - }, - "marker": { - "symbol": "circle" - }, - "mode": "lines+markers+text", - "name": "l0_log", - "orientation": "v", - "showlegend": true, - "text": [ - "1.0", - "0.1", - "0.01", - "0.001", - "0.0001", - "1e-05" - ], - "textposition": "top center", - "type": "scatter", - "x": [ 41310, - 37048, - 32786, - 28524, - 24262, - 20000 + 41310 ], "xaxis": "x", "y": [ 0.0069, - 8.7028, - 19.9847, - 32.505, - 45.9039, - 59.9965 + 77.6470588235294, + 80.19607843137256 ], "yaxis": "y" }, { "customdata": [ [ - "l0_sigmoid" + "random_sampling_minimization" ], [ - "l0_sigmoid" + "random_sampling_minimization" ], [ - "l0_sigmoid" - ], - [ - "l0_sigmoid" - ], - [ - "l0_sigmoid" - ], - [ - "l0_sigmoid" + "random_sampling_minimization" ] ], "hovertemplate": "Strategy: %{customdata[0]}
Size: %{x}<br>Loss: %{y:.4f}<br>
Param: %{text}", - "legendgroup": "l0_sigmoid", + "legendgroup": "random_sampling_minimization", "line": { - "color": "#00cc96", - "dash": "solid" - }, - "marker": { - "symbol": "circle" - }, - "mode": "lines+markers+text", - "name": "l0_sigmoid", - "orientation": "v", - "showlegend": true, - "text": [ - "1.0", - "0.1", - "0.01", - "0.001", - "0.0001", - "1e-05" - ], - "textposition": "top center", - "type": "scatter", - "x": [ - 41310, - 37048, - 32786, - 28524, - 24262, - 20000 - ], - "xaxis": "x", - "y": [ - 0.0069, - 5.8041, - 13.3255, - 21.6723, - 30.6049, - 40 - ], - "yaxis": "y" - }, - { - "customdata": [ - [ - "l1" - ], - [ - "l1" - ], - [ - "l1" - ], - [ - "l1" - ], - [ - "l1" - ], - [ - "l1" - ] - ], - "hovertemplate": "Strategy: %{customdata[0]}
Size: %{x}<br>Loss: %{y:.4f}<br>
Param: %{text}", - "legendgroup": "l1", - "line": { - "color": "#ab63fa", + "color": "#EF553B", "dash": "solid" }, "marker": { "symbol": "circle" }, "mode": "lines+markers+text", - "name": "l1", + "name": "random_sampling_minimization", "orientation": "v", "showlegend": true, "text": [ "1.0", - "0.1", - "0.01", - "0.001", - "0.0001", - "1e-05" + "0.6", + "0.5" ], "textposition": "top center", "type": "scatter", "x": [ 41310, - 37048, - 32786, - 28524, - 24262, - 20000 + 24786, + 20655 ], "xaxis": "x", "y": [ 0.0069, - 14.5, - 33.3033, - 54.1704, - 76.5019, - 99.9896 + 79.11764705882354, + 80.88235294117646 ], "yaxis": "y" } @@ -828,7 +667,7 @@ "x0": 41310, "x1": 41310, "y0": 0.0069, - "y1": 99.9896 + "y1": 80.88235294117646 }, { "line": { @@ -837,7 +676,7 @@ }, "name": "Baseline Loss", "type": "line", - "x0": 20000, + "x0": 20655, "x1": 41310, "y0": 0.0069, "y1": 0.0069