From 4c0f1c611e85436fe5b1c0e1c87deb386846d761 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Thu, 10 Jul 2025 21:20:39 +0100 Subject: [PATCH 01/58] Shrink datasets --- policyengine_us_data/utils/minimise.py | 85 ++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 policyengine_us_data/utils/minimise.py diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py new file mode 100644 index 00000000..4355e889 --- /dev/null +++ b/policyengine_us_data/utils/minimise.py @@ -0,0 +1,85 @@ +from policyengine_us_data.utils.loss import build_loss_matrix +from policyengine_core.data import Dataset +from policyengine_us import Microsimulation +import numpy as np +import pandas as pd + +def minimise_dataset(dataset, output_path: str, loss_rel_change_max: float) -> None: + # if loading from a .h5 file, need to do dataset = Dataset.from_file(dataset) + loss_matrix = build_loss_matrix(dataset, 2024) + + sim = Microsimulation(dataset=dataset) + + weights = sim.calculate("household_weight", 2024).values + estimate_matrix, targets = loss_matrix + is_national = estimate_matrix.columns.str.startswith("nation/") + nation_normalisation_factor = is_national * (1 / is_national.sum()) + state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) + normalisation_factor = np.where( + is_national, nation_normalisation_factor, state_normalisation_factor + ) + weights @ estimate_matrix + + def get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_factor): + """ + Calculate the loss based on the inclusion mask and the estimate matrix. + """ + masked_weights = weights.copy() + original_weight_total = masked_weights.sum() + masked_weights[~inclusion_mask] = 0 + masked_weight_total = masked_weights.sum() + masked_weights[inclusion_mask] *= original_weight_total / masked_weight_total + estimates = masked_weights @ estimate_matrix + rel_error = ((estimates - targets) + 1) / (targets + 1) + loss = ((rel_error * normalisation_factor) ** 2).mean() + + return loss + + COUNT_ITERATIONS = 5 + FRACTION_REMOVE_PER_ITERATION = 0.1 + from tqdm import tqdm + + full_mask = np.ones_like(weights, dtype=bool) + for i in range(COUNT_ITERATIONS): + inclusion_mask = full_mask.copy() + baseline_loss = get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_factor) + household_loss_rel_changes = [] + for household_index in tqdm(range(len(weights))): + # Skip if this household is already excluded + if not inclusion_mask[household_index]: + household_loss_rel_changes.append(np.inf) + continue + # Calculate loss if this household is removed + inclusion_mask = inclusion_mask.copy() + inclusion_mask[household_index] = False + loss = get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_factor) + rel_change = (loss - baseline_loss) / baseline_loss + household_loss_rel_changes.append(rel_change) + inclusion_mask = full_mask.copy() + household_loss_rel_changes = np.array(household_loss_rel_changes) + # Sort by the relative change in loss + sorted_indices = np.argsort(household_loss_rel_changes) + # Remove the worst households + num_to_remove = int(len(weights) * FRACTION_REMOVE_PER_ITERATION) + worst_indices = sorted_indices[:num_to_remove] + inclusion_mask[worst_indices] = False + # Calculate the new loss + new_loss = get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_factor) + print(f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}") + print(f"Removed {num_to_remove} 
households with worst relative loss changes.") + # Update the full mask + full_mask &= inclusion_mask + + household_ids = sim.calculate("household_id", 2024).values + remaining_households = household_ids[full_mask] + + # At this point we have a mask of households to keep + + # I'm saving to a csv for ease of debugging, but we need to save to a .h5 file + + df = sim.to_input_dataframe() + df = df[df["household_id__2024"].isin(remaining_households)] + + df.to_csv(output_path, index=False) + + return df \ No newline at end of file From 6b2a56f6f8a55aacb4ee9e305bd53c74f36c70b0 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Thu, 10 Jul 2025 22:25:27 +0100 Subject: [PATCH 02/58] Move to package --- Makefile | 1 + .../storage/upload_completed_datasets.py | 1 + policyengine_us_data/utils/minimise.py | 127 +++++++++++++++--- 3 files changed, 114 insertions(+), 15 deletions(-) diff --git a/Makefile b/Makefile index 788ba1d3..90b2817a 100644 --- a/Makefile +++ b/Makefile @@ -46,6 +46,7 @@ data: python policyengine_us_data/datasets/cps/extended_cps.py python policyengine_us_data/datasets/cps/enhanced_cps.py python policyengine_us_data/datasets/cps/small_enhanced_cps.py + python policyengine_us_data/utils/minimise.py clean: rm -f policyengine_us_data/storage/*.h5 diff --git a/policyengine_us_data/storage/upload_completed_datasets.py b/policyengine_us_data/storage/upload_completed_datasets.py index f161a9ee..16885d8c 100644 --- a/policyengine_us_data/storage/upload_completed_datasets.py +++ b/policyengine_us_data/storage/upload_completed_datasets.py @@ -15,6 +15,7 @@ def upload_datasets(): Pooled_3_Year_CPS_2023.file_path, CPS_2023.file_path, STORAGE_FOLDER / "small_enhanced_cps_2024.h5", + STORAGE_FOLDER / "enhanced_cps_2024_minified.h5", ] for file_path in dataset_files: diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 4355e889..6fe511fd 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -3,9 +3,43 @@ from policyengine_us import Microsimulation import numpy as np import pandas as pd +import h5py +from policyengine_us_data.storage import STORAGE_FOLDER + + +def create_calibration_log_file(file_path): + dataset = Dataset.from_file(file_path) + + loss_matrix = build_loss_matrix(dataset, 2024) + + sim = Microsimulation(dataset=dataset) + + estimates = sim.calculate("household_weight", 2024).values @ loss_matrix[0] + target_names = loss_matrix[0].columns + target_values = loss_matrix[1] + + df = pd.DataFrame( + { + "target_name": target_names, + "estimate": estimates, + "target": target_values, + } + ) + df["epoch"] = 0 + df["error"] = df["estimate"] - df["target"] + df["rel_error"] = df["error"] / df["target"] + df["abs_error"] = df["error"].abs() + df["rel_abs_error"] = df["abs_error"] / df["target"].abs() + df["loss"] = (df["rel_error"] ** 2).mean() + + df.to_csv(file_path.replace(".h5", "_calibration_log.csv"), index=False) + + +def minimise_dataset( + dataset, output_path: str, loss_rel_change_max: float +) -> None: + create_calibration_log_file(dataset) -def minimise_dataset(dataset, output_path: str, loss_rel_change_max: float) -> None: - # if loading from a .h5 file, need to do dataset = Dataset.from_file(dataset) loss_matrix = build_loss_matrix(dataset, 2024) sim = Microsimulation(dataset=dataset) @@ -20,15 +54,20 @@ def minimise_dataset(dataset, output_path: str, loss_rel_change_max: float) -> N ) weights @ estimate_matrix - def get_loss_from_mask(inclusion_mask, estimate_matrix, targets, 
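A tiny illustration (not part of the patch) of the nation/state normalisation built above. The target names here are hypothetical; only the "nation/" prefix convention is taken from the code. Within each group the factors equal one over the group size, so they sum to 1 per group.

import numpy as np
import pandas as pd

# Hypothetical target columns; only the "nation/" prefix matters here.
columns = pd.Index(
    [
        "nation/income_tax",
        "nation/snap",
        "state/CA/medicaid",
        "state/TX/medicaid",
        "state/NY/medicaid",
    ]
)
is_national = columns.str.startswith("nation/")
normalisation_factor = np.where(
    is_national,
    1 / is_national.sum(),
    1 / (~is_national).sum(),
)
print(normalisation_factor)                      # [0.5 0.5 0.333... 0.333... 0.333...]
print(normalisation_factor[is_national].sum())   # 1.0
print(normalisation_factor[~is_national].sum())  # 1.0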
normalisation_factor): + def get_loss_from_mask( + inclusion_mask, estimate_matrix, targets, normalisation_factor + ): """ Calculate the loss based on the inclusion mask and the estimate matrix. """ masked_weights = weights.copy() original_weight_total = masked_weights.sum() - masked_weights[~inclusion_mask] = 0 + if (~inclusion_mask).sum() > 0: + masked_weights[~inclusion_mask] = 0 masked_weight_total = masked_weights.sum() - masked_weights[inclusion_mask] *= original_weight_total / masked_weight_total + masked_weights[inclusion_mask] *= ( + original_weight_total / masked_weight_total + ) estimates = masked_weights @ estimate_matrix rel_error = ((estimates - targets) + 1) / (targets + 1) loss = ((rel_error * normalisation_factor) ** 2).mean() @@ -36,15 +75,23 @@ def get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_f return loss COUNT_ITERATIONS = 5 + VIEW_FRACTION_PER_ITERATION = 0.3 FRACTION_REMOVE_PER_ITERATION = 0.1 from tqdm import tqdm full_mask = np.ones_like(weights, dtype=bool) for i in range(COUNT_ITERATIONS): inclusion_mask = full_mask.copy() - baseline_loss = get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_factor) + baseline_loss = get_loss_from_mask( + inclusion_mask, estimate_matrix, targets, normalisation_factor + ) household_loss_rel_changes = [] - for household_index in tqdm(range(len(weights))): + indices = np.random.choice( + np.arange(len(weights)), + size=int(len(weights) * VIEW_FRACTION_PER_ITERATION), + replace=False, + ) + for household_index in tqdm(indices): # Skip if this household is already excluded if not inclusion_mask[household_index]: household_loss_rel_changes.append(np.inf) @@ -52,7 +99,9 @@ def get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_f # Calculate loss if this household is removed inclusion_mask = inclusion_mask.copy() inclusion_mask[household_index] = False - loss = get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_factor) + loss = get_loss_from_mask( + inclusion_mask, estimate_matrix, targets, normalisation_factor + ) rel_change = (loss - baseline_loss) / baseline_loss household_loss_rel_changes.append(rel_change) inclusion_mask = full_mask.copy() @@ -64,12 +113,24 @@ def get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_f worst_indices = sorted_indices[:num_to_remove] inclusion_mask[worst_indices] = False # Calculate the new loss - new_loss = get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_factor) - print(f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}") - print(f"Removed {num_to_remove} households with worst relative loss changes.") + new_loss = get_loss_from_mask( + inclusion_mask, estimate_matrix, targets, normalisation_factor + ) + rel_change = (new_loss - baseline_loss) / baseline_loss + if rel_change > loss_rel_change_max: + print( + f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}, which is too high ({rel_change:.2%}). Stopping." + ) + break + print( + f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}" + ) + print( + f"Removed {num_to_remove} households with worst relative loss changes." 
+ ) # Update the full mask full_mask &= inclusion_mask - + household_ids = sim.calculate("household_id", 2024).values remaining_households = household_ids[full_mask] @@ -78,8 +139,44 @@ def get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_f # I'm saving to a csv for ease of debugging, but we need to save to a .h5 file df = sim.to_input_dataframe() - df = df[df["household_id__2024"].isin(remaining_households)] + smaller_df = df[df["household_id__2024"].isin(remaining_households)] + + weight_rel_change = ( + smaller_df["household_weight__2024"].sum() + / df["household_weight__2024"].sum() + ) + print(f"Weight relative change: {weight_rel_change:.2%}") + + sim = Microsimulation(dataset=smaller_df) + + sim.set_input( + "household_weight", + 2024, + sim.calculate("household_weight", 2024).values / weight_rel_change, + ) + + data = {} + + for variable in sim.input_variables: + data[variable] = {2024: sim.calculate(variable, 2024).values} + if data[variable][2024].dtype == "object": + data[variable][2024] = data[variable][2024].astype("S") + + with h5py.File(output_path, "w") as f: + for variable, values in data.items(): + for year, value in values.items(): + f.create_dataset(f"{variable}/{year}", data=value) + print(f"Saved minimised dataset to {output_path}") + + create_calibration_log_file(output_path) + - df.to_csv(output_path, index=False) +if __name__ == "__main__": + # Example usage + files = [ + STORAGE_FOLDER / "enhanced_cps_2024.h5", + ] - return df \ No newline at end of file + for file in files: + output_path = file.with_name(file.stem + "_minimised.h5") + minimise_dataset(file, output_path, loss_rel_change_max=10) From 05ee7e4075293057756d24da0e23b36a6cfe3465 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Thu, 10 Jul 2025 23:50:52 +0100 Subject: [PATCH 03/58] Try L0 --- Makefile | 1 - .../datasets/cps/enhanced_cps.py | 18 +++++++++++++++++- policyengine_us_data/utils/minimise.py | 4 +++- 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 90b2817a..788ba1d3 100644 --- a/Makefile +++ b/Makefile @@ -46,7 +46,6 @@ data: python policyengine_us_data/datasets/cps/extended_cps.py python policyengine_us_data/datasets/cps/enhanced_cps.py python policyengine_us_data/datasets/cps/small_enhanced_cps.py - python policyengine_us_data/utils/minimise.py clean: rm -f policyengine_us_data/storage/*.h5 diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index b8af12ce..9e61414c 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -59,9 +59,25 @@ def loss(weights): ((estimate - targets_array) + 1) / (targets_array + 1) ) ** 2 rel_error_normalized = rel_error * normalisation_factor + + # L0 penalty (approximated with smooth function) + # Since L0 is non-differentiable, we use a smooth approximation + # Common approaches: + + # Option 1: Sigmoid approximation + epsilon = 1e-3 # Threshold for "near zero" + l0_penalty_weight = 1e-1 # Adjust this hyperparameter + smoothed_l0 = torch.sigmoid((weights - epsilon) / (epsilon * 0.1)).mean() + + # Option 2: Log-sum penalty (smoother) + # smoothed_l0 = torch.log(1 + actual_weights / epsilon).sum() / len(actual_weights) + + # Option 3: Exponential penalty + # smoothed_l0 = (1 - torch.exp(-actual_weights / epsilon)).mean() + if torch.isnan(rel_error_normalized).any(): raise ValueError("Relative error contains NaNs") - return rel_error_normalized.mean() + return 
rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 def dropout_weights(weights, p): if p == 0: diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 6fe511fd..2b122fec 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -32,14 +32,16 @@ def create_calibration_log_file(file_path): df["rel_abs_error"] = df["abs_error"] / df["target"].abs() df["loss"] = (df["rel_error"] ** 2).mean() - df.to_csv(file_path.replace(".h5", "_calibration_log.csv"), index=False) + df.to_csv(str(file_path).replace(".h5", "_calibration_log.csv"), index=False) def minimise_dataset( dataset, output_path: str, loss_rel_change_max: float ) -> None: + dataset = str(dataset) create_calibration_log_file(dataset) + dataset = Dataset.from_file(dataset) loss_matrix = build_loss_matrix(dataset, 2024) sim = Microsimulation(dataset=dataset) From e38c6479483c9b2fb0cca9939c881995267a10d7 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Thu, 10 Jul 2025 23:54:02 +0100 Subject: [PATCH 04/58] Format --- policyengine_us_data/datasets/cps/enhanced_cps.py | 10 ++++++---- policyengine_us_data/utils/minimise.py | 4 +++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 9e61414c..7d81a0c0 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -63,15 +63,17 @@ def loss(weights): # L0 penalty (approximated with smooth function) # Since L0 is non-differentiable, we use a smooth approximation # Common approaches: - + # Option 1: Sigmoid approximation epsilon = 1e-3 # Threshold for "near zero" l0_penalty_weight = 1e-1 # Adjust this hyperparameter - smoothed_l0 = torch.sigmoid((weights - epsilon) / (epsilon * 0.1)).mean() - + smoothed_l0 = torch.sigmoid( + (weights - epsilon) / (epsilon * 0.1) + ).mean() + # Option 2: Log-sum penalty (smoother) # smoothed_l0 = torch.log(1 + actual_weights / epsilon).sum() / len(actual_weights) - + # Option 3: Exponential penalty # smoothed_l0 = (1 - torch.exp(-actual_weights / epsilon)).mean() diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 2b122fec..186a7673 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -32,7 +32,9 @@ def create_calibration_log_file(file_path): df["rel_abs_error"] = df["abs_error"] / df["target"].abs() df["loss"] = (df["rel_error"] ** 2).mean() - df.to_csv(str(file_path).replace(".h5", "_calibration_log.csv"), index=False) + df.to_csv( + str(file_path).replace(".h5", "_calibration_log.csv"), index=False + ) def minimise_dataset( From bdf3d6d89d16ac396786899ce3e3233c0c46ceb4 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Fri, 11 Jul 2025 16:24:22 +0200 Subject: [PATCH 05/58] attempting to vectorize minimizing of ecps --- changelog_entry.yaml | 4 + .../datasets/cps/enhanced_cps.py | 27 +++--- policyengine_us_data/utils/minimise.py | 83 ++++++++++++++++--- 3 files changed, 91 insertions(+), 23 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..84eeb584 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: minor + changes: + added: + - Enhanced CPS minimizing tests. 
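As a standalone sketch (not from any patch above), the three smooth L0 surrogates can be compared on a toy weight vector. epsilon matches the "near zero" threshold chosen in enhanced_cps.py; the weight values and the non-differentiable reference are illustrative only.

import torch

# Toy weights spanning "effectively zero" to clearly non-zero.
weights = torch.tensor([0.0, 1e-4, 1e-3, 0.5, 2.0, 10.0])
epsilon = 1e-3  # same threshold as in the loss above

# Option 1: sigmoid approximation (penalty_approach == "l0_sigmoid")
sigmoid_l0 = torch.sigmoid((weights - epsilon) / (epsilon * 0.1)).mean()
# Option 2: log-sum surrogate
log_l0 = torch.log(1 + weights / epsilon).sum() / len(weights)
# Option 3: exponential surrogate
exp_l0 = (1 - torch.exp(-weights / epsilon)).mean()
# Non-differentiable reference: fraction of weights above the threshold
exact_l0 = (weights > epsilon).float().mean()

print(
    f"sigmoid={sigmoid_l0.item():.3f} "
    f"log={log_l0.item():.3f} "
    f"exp={exp_l0.item():.3f} "
    f"exact={exact_l0.item():.3f}"
)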
\ No newline at end of file diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 7d81a0c0..bf303f7a 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -45,8 +45,8 @@ def reweight( np.log(original_weights), requires_grad=True, dtype=torch.float32 ) - # TODO: replace this with a call to the python reweight.py package. - def loss(weights): + # TO DO: replace this with a call to the python reweight.py package. + def loss(weights, penalty_approach="l0_sigmoid"): # Check for Nans in either the weights or the loss matrix if torch.isnan(weights).any(): raise ValueError("Weights contain NaNs") @@ -60,25 +60,32 @@ def loss(weights): ) ** 2 rel_error_normalized = rel_error * normalisation_factor + if torch.isnan(rel_error_normalized).any(): + raise ValueError("Relative error contains NaNs") + # L0 penalty (approximated with smooth function) # Since L0 is non-differentiable, we use a smooth approximation # Common approaches: - # Option 1: Sigmoid approximation epsilon = 1e-3 # Threshold for "near zero" l0_penalty_weight = 1e-1 # Adjust this hyperparameter - smoothed_l0 = torch.sigmoid( - (weights - epsilon) / (epsilon * 0.1) - ).mean() + + # Option 1: Sigmoid approximation + if penalty_approach == "l0_sigmoid": + smoothed_l0 = torch.sigmoid( + (weights - epsilon) / (epsilon * 0.1) + ).mean() # Option 2: Log-sum penalty (smoother) - # smoothed_l0 = torch.log(1 + actual_weights / epsilon).sum() / len(actual_weights) + if penalty_approach == "l0_log": + smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len(weights) # Option 3: Exponential penalty - # smoothed_l0 = (1 - torch.exp(-actual_weights / epsilon)).mean() + if penalty_approach == "l0_exp": + smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() + + # L1 penalty - if torch.isnan(rel_error_normalized).any(): - raise ValueError("Relative error contains NaNs") return rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 def dropout_weights(weights, p): diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 186a7673..94601d02 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -5,6 +5,7 @@ import pandas as pd import h5py from policyengine_us_data.storage import STORAGE_FOLDER +from typing import Optional def create_calibration_log_file(file_path): @@ -37,6 +38,57 @@ def create_calibration_log_file(file_path): ) +def losses_for_candidates( + base_weights: np.ndarray, + idxs: np.ndarray, + est_mat: np.ndarray, + targets: np.ndarray, + norm: np.ndarray, + chunk_size: Optional[int] = 25_000, +) -> np.ndarray: + """ + Return the loss value *for each* candidate deletion in `idxs` + in one matrix multiplication. 
+ + Parameters + ---------- + base_weights : (n,) original weight vector + idxs : (k,) candidate row indices to zero-out + est_mat : (n, m) estimate matrix + targets : (m,) calibration targets + norm : (m,) normalisation factors + chunk_size : max number of candidates to process at once + + Returns + ------- + losses : (k,) loss if row i were removed (and weights rescaled) + """ + W = base_weights + total = W.sum() + k = len(idxs) + losses = np.empty(k, dtype=float) + + # Work through the candidate list in blocks + for start in range(0, k, chunk_size): + stop = min(start + chunk_size, k) + part = idxs[start:stop] # (p,) where p ≤ chunk_size + p = len(part) + + # Build the delta matrix only for this chunk + delta = np.zeros((p, len(W))) + delta[np.arange(p), part] = -W[part] + + keep_total = total + delta.sum(axis=1) # (p,) + delta *= (total / keep_total)[:, None] + + # Matrix–matrix multiply → one matrix multiplication per chunk + ests = (W + delta) @ est_mat # (p, m) + rel_err = ((ests - targets) + 1) / (targets + 1) + losses[start:stop] = ((rel_err * norm) ** 2).mean(axis=1) + + return losses + + def minimise_dataset( dataset, output_path: str, loss_rel_change_max: float ) -> None: @@ -95,19 +147,24 @@ def get_loss_from_mask( size=int(len(weights) * VIEW_FRACTION_PER_ITERATION), replace=False, ) - for household_index in tqdm(indices): - # Skip if this household is already excluded - if not inclusion_mask[household_index]: - household_loss_rel_changes.append(np.inf) - continue - # Calculate loss if this household is removed - inclusion_mask = inclusion_mask.copy() - inclusion_mask[household_index] = False - loss = get_loss_from_mask( - inclusion_mask, estimate_matrix, targets, normalisation_factor - ) - rel_change = (loss - baseline_loss) / baseline_loss - household_loss_rel_changes.append(rel_change) + + # more efficient approach to compute losses for candidate households to be removed + + # 1. sample only households that are currently *included* + indices = np.random.choice( + np.where(full_mask)[0], + size=int(full_mask.sum() * VIEW_FRACTION_PER_ITERATION), + replace=False, + ) + # 2. compute losses for the batch in one shot + candidate_losses = losses_for_candidates( + weights, indices, estimate_matrix, targets, normalisation_factor + ) + # 3. convert to relative change vs. 
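A synthetic-data sketch (assuming the patched policyengine_us_data.utils.minimise module is importable): chunk_size only controls how many candidate deletions are batched into each matrix multiply, so the returned losses should not depend on it.

import numpy as np
from policyengine_us_data.utils.minimise import losses_for_candidates

rng = np.random.default_rng(0)
n, m = 500, 40
weights = rng.uniform(0.5, 2.0, n)
est_mat = rng.uniform(0.0, 1.0, (n, m))
targets = weights @ est_mat * rng.uniform(0.9, 1.1, m)
norm = np.full(m, 1.0 / m)

idxs = rng.choice(n, size=50, replace=False)
losses_small_chunks = losses_for_candidates(
    weights, idxs, est_mat, targets, norm, chunk_size=7
)
losses_one_chunk = losses_for_candidates(
    weights, idxs, est_mat, targets, norm, chunk_size=len(idxs)
)
assert np.allclose(losses_small_chunks, losses_one_chunk)
print(losses_small_chunks[:5])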
baseline + household_loss_rel_changes = ( + candidate_losses - baseline_loss + ) / baseline_loss + inclusion_mask = full_mask.copy() household_loss_rel_changes = np.array(household_loss_rel_changes) # Sort by the relative change in loss From 03e5d0d380494b698cbcb4af14b5c8eb256754d0 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Fri, 11 Jul 2025 16:52:43 +0200 Subject: [PATCH 06/58] adding random sampling minimization strategy --- policyengine_us_data/utils/minimise.py | 240 ++++++++++++++++++------- 1 file changed, 173 insertions(+), 67 deletions(-) diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 94601d02..45212905 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -5,7 +5,7 @@ import pandas as pd import h5py from policyengine_us_data.storage import STORAGE_FOLDER -from typing import Optional +from typing import Optional, Callable def create_calibration_log_file(file_path): @@ -89,116 +89,214 @@ def losses_for_candidates( return losses -def minimise_dataset( - dataset, output_path: str, loss_rel_change_max: float -) -> None: - dataset = str(dataset) - create_calibration_log_file(dataset) +def get_loss_from_mask( + weights, inclusion_mask, estimate_matrix, targets, normalisation_factor +): + """ + Calculate the loss based on the inclusion mask and the estimate matrix. + """ + masked_weights = weights.copy() + original_weight_total = masked_weights.sum() + if (~inclusion_mask).sum() > 0: + masked_weights[~inclusion_mask] = 0 + masked_weight_total = masked_weights.sum() + masked_weights[inclusion_mask] *= ( + original_weight_total / masked_weight_total + ) + estimates = masked_weights @ estimate_matrix + rel_error = ((estimates - targets) + 1) / (targets + 1) + loss = ((rel_error * normalisation_factor) ** 2).mean() - dataset = Dataset.from_file(dataset) - loss_matrix = build_loss_matrix(dataset, 2024) + return loss - sim = Microsimulation(dataset=dataset) - weights = sim.calculate("household_weight", 2024).values - estimate_matrix, targets = loss_matrix - is_national = estimate_matrix.columns.str.startswith("nation/") - nation_normalisation_factor = is_national * (1 / is_national.sum()) - state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) - normalisation_factor = np.where( - is_national, nation_normalisation_factor, state_normalisation_factor - ) - weights @ estimate_matrix - - def get_loss_from_mask( - inclusion_mask, estimate_matrix, targets, normalisation_factor - ): - """ - Calculate the loss based on the inclusion mask and the estimate matrix. - """ - masked_weights = weights.copy() - original_weight_total = masked_weights.sum() - if (~inclusion_mask).sum() > 0: - masked_weights[~inclusion_mask] = 0 - masked_weight_total = masked_weights.sum() - masked_weights[inclusion_mask] *= ( - original_weight_total / masked_weight_total - ) - estimates = masked_weights @ estimate_matrix - rel_error = ((estimates - targets) + 1) / (targets + 1) - loss = ((rel_error * normalisation_factor) ** 2).mean() +def candidate_loss_contribution( + weights: np.ndarray, + estimate_matrix: np.ndarray, + targets: np.ndarray, + normalisation_factor: np.ndarray, + loss_rel_change_max: float, + count_iterations: int = 5, + view_fraction_per_iteration: float = 0.3, + fraction_remove_per_iteration: float = 0.1, +) -> np.ndarray: + """ + Minimization approach based on candidate loss contribution. 
+ + This function iteratively removes households that contribute least to the loss, + maintaining the calibration quality within the specified tolerance. - return loss + Parameters + ---------- + weights : (n,) household weights + estimate_matrix : (n, m) matrix mapping weights to estimates + targets : (m,) calibration targets + normalisation_factor : (m,) normalisation factors for different targets + loss_rel_change_max : maximum allowed relative change in loss + count_iterations : number of iterations to perform + view_fraction_per_iteration : fraction of households to evaluate each iteration + fraction_remove_per_iteration : fraction of households to remove each iteration - COUNT_ITERATIONS = 5 - VIEW_FRACTION_PER_ITERATION = 0.3 - FRACTION_REMOVE_PER_ITERATION = 0.1 + Returns + ------- + inclusion_mask : (n,) boolean mask of households to keep + """ from tqdm import tqdm full_mask = np.ones_like(weights, dtype=bool) - for i in range(COUNT_ITERATIONS): + + for i in range(count_iterations): inclusion_mask = full_mask.copy() baseline_loss = get_loss_from_mask( - inclusion_mask, estimate_matrix, targets, normalisation_factor + weights, + inclusion_mask, + estimate_matrix, + targets, + normalisation_factor, ) - household_loss_rel_changes = [] - indices = np.random.choice( - np.arange(len(weights)), - size=int(len(weights) * VIEW_FRACTION_PER_ITERATION), - replace=False, - ) - - # more efficient approach to compute losses for candidate households to be removed - # 1. sample only households that are currently *included* + # Sample only households that are currently included indices = np.random.choice( np.where(full_mask)[0], - size=int(full_mask.sum() * VIEW_FRACTION_PER_ITERATION), + size=int(full_mask.sum() * view_fraction_per_iteration), replace=False, ) - # 2. compute losses for the batch in one shot + + # Compute losses for the batch in one shot candidate_losses = losses_for_candidates( weights, indices, estimate_matrix, targets, normalisation_factor ) - # 3. convert to relative change vs. baseline + + # Convert to relative change vs. baseline household_loss_rel_changes = ( candidate_losses - baseline_loss ) / baseline_loss - inclusion_mask = full_mask.copy() - household_loss_rel_changes = np.array(household_loss_rel_changes) # Sort by the relative change in loss sorted_indices = np.argsort(household_loss_rel_changes) + # Remove the worst households - num_to_remove = int(len(weights) * FRACTION_REMOVE_PER_ITERATION) - worst_indices = sorted_indices[:num_to_remove] + num_to_remove = int(len(weights) * fraction_remove_per_iteration) + worst_indices = indices[sorted_indices[:num_to_remove]] inclusion_mask[worst_indices] = False + # Calculate the new loss new_loss = get_loss_from_mask( - inclusion_mask, estimate_matrix, targets, normalisation_factor + weights, + inclusion_mask, + estimate_matrix, + targets, + normalisation_factor, ) rel_change = (new_loss - baseline_loss) / baseline_loss + if rel_change > loss_rel_change_max: print( - f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}, which is too high ({rel_change:.2%}). Stopping." + f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}, " + f"which is too high ({rel_change:.2%}). Stopping." ) break + print( f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}" ) print( f"Removed {num_to_remove} households with worst relative loss changes." 
) + # Update the full mask full_mask &= inclusion_mask - household_ids = sim.calculate("household_id", 2024).values - remaining_households = household_ids[full_mask] + return full_mask + + +def random_sampling_minimization( + weights, + estimate_matrix, + targets, + normalisation_factor, + target_fractions=[0.1, 0.2, 0.3, 0.4, 0.5], +): + """A simple random sampling approach""" + n = len(weights) + + final_mask = None + lowest_loss = float("inf") + for fraction in target_fractions: + target_size = int(n * fraction) + # Random sampling with multiple attempts + best_mask = None + best_loss = float("inf") + + for _ in range(5): # Try 5 random samples + mask = np.zeros(n, dtype=bool) + mask[np.random.choice(n, target_size, replace=False)] = True + + loss = get_loss_from_mask( + weights, mask, estimate_matrix, targets, normalisation_factor + ) + + if loss < best_loss: + best_loss = loss + best_mask = mask + + if lowest_loss > best_loss: + lowest_loss = best_loss + final_mask = best_mask + + return final_mask + + +def minimise_dataset( + dataset, + output_path: str, + loss_rel_change_max: float, + minimization_function: Callable = candidate_loss_contribution, + **kwargs, +) -> None: + """ + Main function to minimize a dataset using a specified minimization approach. + + Parameters + ---------- + dataset : path to the dataset file or Dataset object + output_path : path where the minimized dataset will be saved + loss_rel_change_max : maximum allowed relative change in loss + minimization_function : function that implements the minimization logic + **kwargs : additional arguments to pass to the minimization function + """ + dataset = str(dataset) + create_calibration_log_file(dataset) + + dataset = Dataset.from_file(dataset) + loss_matrix = build_loss_matrix(dataset, 2024) - # At this point we have a mask of households to keep + sim = Microsimulation(dataset=dataset) - # I'm saving to a csv for ease of debugging, but we need to save to a .h5 file + weights = sim.calculate("household_weight", 2024).values + estimate_matrix, targets = loss_matrix + is_national = estimate_matrix.columns.str.startswith("nation/") + nation_normalisation_factor = is_national * (1 / is_national.sum()) + state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) + normalisation_factor = np.where( + is_national, nation_normalisation_factor, state_normalisation_factor + ) + + # Call the minimization function + inclusion_mask = minimization_function( + weights=weights, + estimate_matrix=estimate_matrix, + targets=targets, + normalisation_factor=normalisation_factor, + loss_rel_change_max=loss_rel_change_max, + **kwargs, + ) + + # Extract household IDs for remaining households + household_ids = sim.calculate("household_id", 2024).values + remaining_households = household_ids[inclusion_mask] + # Create a smaller dataset with only the remaining households df = sim.to_input_dataframe() smaller_df = df[df["household_id__2024"].isin(remaining_households)] @@ -208,27 +306,30 @@ def get_loss_from_mask( ) print(f"Weight relative change: {weight_rel_change:.2%}") + # Create new simulation with smaller dataset sim = Microsimulation(dataset=smaller_df) + # Rescale weights to maintain total sim.set_input( "household_weight", 2024, sim.calculate("household_weight", 2024).values / weight_rel_change, ) + # Prepare data for saving data = {} - for variable in sim.input_variables: data[variable] = {2024: sim.calculate(variable, 2024).values} if data[variable][2024].dtype == "object": data[variable][2024] = 
data[variable][2024].astype("S") + # Save to HDF5 file with h5py.File(output_path, "w") as f: for variable, values in data.items(): for year, value in values.items(): f.create_dataset(f"{variable}/{year}", data=value) - print(f"Saved minimised dataset to {output_path}") + print(f"Saved minimised dataset to {output_path}") create_calibration_log_file(output_path) @@ -240,4 +341,9 @@ def get_loss_from_mask( for file in files: output_path = file.with_name(file.stem + "_minimised.h5") - minimise_dataset(file, output_path, loss_rel_change_max=10) + minimise_dataset( + file, + output_path, + loss_rel_change_max=10, + minimization_function=candidate_loss_contribution, + ) From cd0776c0eb7d1745e987ace34ecc4b56306eee2b Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Fri, 11 Jul 2025 19:19:58 +0200 Subject: [PATCH 07/58] add notebook with testing functionality (havent tested locally) --- .../datasets/cps/enhanced_cps.py | 8 +- policyengine_us_data/utils/minimise.py | 2 +- test_minimization_approach.ipynb | 107 ++++++++++++++++++ 3 files changed, 115 insertions(+), 2 deletions(-) create mode 100644 test_minimization_approach.ipynb diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index bf303f7a..08798622 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -28,6 +28,7 @@ def reweight( targets_array, dropout_rate=0.05, log_path="calibration_log.csv", + penalty_approach="l0_sigmoid", ): target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") @@ -46,7 +47,7 @@ def reweight( ) # TO DO: replace this with a call to the python reweight.py package. - def loss(weights, penalty_approach="l0_sigmoid"): + def loss(weights, penalty_approach=penalty_approach): # Check for Nans in either the weights or the loss matrix if torch.isnan(weights).any(): raise ValueError("Weights contain NaNs") @@ -85,6 +86,11 @@ def loss(weights, penalty_approach="l0_sigmoid"): smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() # L1 penalty + l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs + + if penalty_approach == "l1": + l1 = torch.mean(weights) + return rel_error_normalized.mean() + l1_penalty_weight * l1 return rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 45212905..a9ba3959 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -330,7 +330,7 @@ def minimise_dataset( f.create_dataset(f"{variable}/{year}", data=value) print(f"Saved minimised dataset to {output_path}") - create_calibration_log_file(output_path) + create_calibration_log_file(output_path) if __name__ == "__main__": diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb new file mode 100644 index 00000000..519d2725 --- /dev/null +++ b/test_minimization_approach.ipynb @@ -0,0 +1,107 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 5, + "id": "d6dc9cca", + "metadata": {}, + "outputs": [], + "source": [ + "from policyengine_us_data.utils.minimise import minimise_dataset, random_sampling_minimization, candidate_loss_contribution\n", + "from policyengine_us_data.storage import STORAGE_FOLDER\n", + "from policyengine_us import Microsimulation\n", + "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", + "from policyengine_us_data.utils import 
build_loss_matrix\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db975ac1", + "metadata": {}, + "outputs": [], + "source": [ + "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", + "\n", + "files = [\n", + " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", + " ]\n", + "\n", + "approach = \"random_sampling_minimization\" # for which you can specify the fraction\n", + "minimization_function = random_sampling_minimization\n", + "# other minimization function approach is \"candidate_loss_contribution\"\n", + "\n", + "for file in files:\n", + " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + " minimise_dataset(\n", + " file,\n", + " output_path,\n", + " loss_rel_change_max=10,\n", + " minimization_function=minimization_function, \n", + " target_fractions=[0.5] # remove if switching approach\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35892c9d", + "metadata": {}, + "outputs": [], + "source": [ + "## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation\n", + "\n", + "input_dataset = ExtendedCPS_2024\n", + "\n", + "approach = \"l0_sigmoid\"\n", + "# other options are \"l0_log\", \"l0_exp\", \"l1\"\n", + "\n", + "sim = Microsimulation(dataset=input_dataset)\n", + "data = sim.dataset.load_dataset()\n", + "data[\"household_weight\"] = {}\n", + "original_weights = sim.calculate(\"household_weight\")\n", + "original_weights = original_weights.values + np.random.normal(\n", + " 1, 0.1, len(original_weights)\n", + ")\n", + "for year in range(2024, 2025):\n", + " loss_matrix, targets_array = build_loss_matrix(\n", + " input_dataset, year\n", + " )\n", + " optimised_weights = reweight(\n", + " original_weights,\n", + " loss_matrix,\n", + " targets_array,\n", + " log_path= STORAGE_FOLDER / approach / \"calibration_log.csv\",\n", + " penalty_approach=approach,\n", + " )\n", + " data[\"household_weight\"][year] = optimised_weights\n", + "\n", + "output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + "\n", + "data.save_dataset(output_path)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pe", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 2c050fc973ba312d070c27dcb7f1fb049e1e2af2 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Fri, 11 Jul 2025 19:20:55 +0200 Subject: [PATCH 08/58] lint --- policyengine_us_data/utils/minimise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index a9ba3959..45212905 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -330,7 +330,7 @@ def minimise_dataset( f.create_dataset(f"{variable}/{year}", data=value) print(f"Saved minimised dataset to {output_path}") - create_calibration_log_file(output_path) + create_calibration_log_file(output_path) if __name__ == "__main__": From ee98fc36ab920d571982862dc48d950b7a58ec3d Mon Sep 17 00:00:00 2001 From: eccuraa Date: Fri, 11 Jul 2025 20:06:32 -0400 Subject: [PATCH 09/58] debugged 2nd cell: created path & removed optional parameters. 
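A usage sketch (not part of the commit) of the reworked entry point: minimise_dataset now forwards **kwargs to the chosen strategy, so candidate_loss_contribution receives loss_rel_change_max while random_sampling_minimization receives target_fractions. The output file name for the random-sampling run is made up.

from policyengine_us_data.storage import STORAGE_FOLDER
from policyengine_us_data.utils.minimise import (
    minimise_dataset,
    candidate_loss_contribution,
    random_sampling_minimization,
)

source = STORAGE_FOLDER / "enhanced_cps_2024.h5"

# Greedy candidate-loss-contribution strategy: stops once the loss worsens
# by more than loss_rel_change_max.
minimise_dataset(
    source,
    STORAGE_FOLDER / "enhanced_cps_2024_minimised.h5",
    minimization_function=candidate_loss_contribution,
    loss_rel_change_max=10,
)

# Random-sampling strategy: keeps the best random subset at each fraction.
minimise_dataset(
    source,
    STORAGE_FOLDER / "enhanced_cps_2024_random_sample.h5",  # hypothetical name
    minimization_function=random_sampling_minimization,
    target_fractions=[0.5],
)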
--- policyengine_us_data/utils/minimise.py | 8 +- test_minimization_approach.ipynb | 219 +++++++++++++++++++++++-- 2 files changed, 210 insertions(+), 17 deletions(-) diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 45212905..e84e1bee 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -250,10 +250,11 @@ def random_sampling_minimization( def minimise_dataset( dataset, output_path: str, - loss_rel_change_max: float, minimization_function: Callable = candidate_loss_contribution, **kwargs, ) -> None: + #loss_rel_change_max = kwargs.pop('loss_rel_change_max', 10.0) + """ Main function to minimize a dataset using a specified minimization approach. @@ -288,8 +289,7 @@ def minimise_dataset( estimate_matrix=estimate_matrix, targets=targets, normalisation_factor=normalisation_factor, - loss_rel_change_max=loss_rel_change_max, - **kwargs, + **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. ) # Extract household IDs for remaining households @@ -344,6 +344,4 @@ def minimise_dataset( minimise_dataset( file, output_path, - loss_rel_change_max=10, - minimization_function=candidate_loss_contribution, ) diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 519d2725..8400d4fe 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "d6dc9cca", "metadata": {}, "outputs": [], @@ -12,15 +12,188 @@ "from policyengine_us import Microsimulation\n", "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", "from policyengine_us_data.utils import build_loss_matrix\n", - "import numpy as np" + "import numpy as np\n", + "import os\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "db975ac1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + "Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid enrollment for IN with target 1623361k\n", + "Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with target 1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + "Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid 
enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n", + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + "Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid enrollment for IN with target 1623361k\n", + "Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with target 1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + "Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid 
enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n", + "Iteration 1: Loss changed from 4.565926440883813e-08 to 5.336074793134824e-08\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 2: Loss changed from 5.336074793134824e-08 to 6.613975524012925e-08\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 3: Loss changed from 6.613975524012925e-08 to 8.772356346466888e-08\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 4: Loss changed from 8.772356346466888e-08 to 1.1406333044735764e-07\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 5: Loss changed from 1.1406333044735764e-07 to 1.3666972105275468e-07\n", + "Removed 4131 households with worst relative loss changes.\n", + "Weight relative change: 52.19%\n", + "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + "Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid 
enrollment for IN with target 1623361k\n", + "Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with target 1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + "Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n" + ] + } + ], "source": [ "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", "\n", @@ -28,27 +201,49 @@ " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", " ]\n", "\n", - "approach = \"random_sampling_minimization\" # for which you can specify the fraction\n", - "minimization_function = random_sampling_minimization\n", - "# other minimization function approach is \"candidate_loss_contribution\"\n", + "approach = \"candidate_loss_contribution\" # for which you can specify the fraction\n", + "minimization_function = candidate_loss_contribution\n", + "# other minimization function approach is \"candidate_loss_contribution\", for which you can specify the tolerance for loss relative change.\n", "\n", "for file in files:\n", " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + "\n", " minimise_dataset(\n", " file,\n", " output_path,\n", " loss_rel_change_max=10,\n", " minimization_function=minimization_function, \n", - " target_fractions=[0.5] # remove if switching approach\n", + " #target_fractions=[0.5] # remove if switching approach\n", " )" ] }, { "cell_type": "code", - "execution_count": null, 
+ "execution_count": 5, "id": "35892c9d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "ValueError", + "evalue": "Invalid response code 404 for url https://api.github.com/repos/policyengine/irs-soi-puf/releases/tags/1.8.0.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[5], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m approach \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124ml0_sigmoid\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# other options are \"l0_log\", \"l0_exp\", \"l1\"\u001b[39;00m\n\u001b[0;32m----> 8\u001b[0m sim \u001b[38;5;241m=\u001b[39m \u001b[43mMicrosimulation\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minput_dataset\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 9\u001b[0m data \u001b[38;5;241m=\u001b[39m sim\u001b[38;5;241m.\u001b[39mdataset\u001b[38;5;241m.\u001b[39mload_dataset()\n\u001b[1;32m 10\u001b[0m data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhousehold_weight\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m {}\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/system.py:221\u001b[0m, in \u001b[0;36mMicrosimulation.__init__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 214\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 215\u001b[0m dataset \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, \u001b[38;5;28mstr\u001b[39m)\n\u001b[1;32m 217\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcps_2023\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m dataset\n\u001b[1;32m 218\u001b[0m ):\n\u001b[1;32m 219\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m2023\u001b[39m\n\u001b[0;32m--> 221\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 223\u001b[0m reform \u001b[38;5;241m=\u001b[39m create_structural_reforms_from_parameters(\n\u001b[1;32m 224\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtax_benefit_system\u001b[38;5;241m.\u001b[39mparameters, start_instant\n\u001b[1;32m 225\u001b[0m )\n\u001b[1;32m 226\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m reform \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:184\u001b[0m, in \u001b[0;36mSimulation.__init__\u001b[0;34m(self, tax_benefit_system, populations, situation, dataset, reform, trace, default_input_period, default_calculation_period)\u001b[0m\n\u001b[1;32m 180\u001b[0m dataset \u001b[38;5;241m=\u001b[39m 
Dataset\u001b[38;5;241m.\u001b[39mfrom_file(\n\u001b[1;32m 181\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 182\u001b[0m )\n\u001b[1;32m 183\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, \u001b[38;5;28mtype\u001b[39m):\n\u001b[0;32m--> 184\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset: Dataset \u001b[38;5;241m=\u001b[39m \u001b[43mdataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequire\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 185\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, pd\u001b[38;5;241m.\u001b[39mDataFrame):\n\u001b[1;32m 186\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_dataframe(\n\u001b[1;32m 187\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 188\u001b[0m )\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/data/dataset.py:103\u001b[0m, in \u001b[0;36mDataset.__init__\u001b[0;34m(self, require)\u001b[0m\n\u001b[1;32m 101\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdownload()\n\u001b[1;32m 102\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 103\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/datasets/cps/extended_cps.py:147\u001b[0m, in \u001b[0;36mExtendedCPS.generate\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 144\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpolicyengine_us\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Microsimulation\n\u001b[1;32m 146\u001b[0m cps_sim \u001b[38;5;241m=\u001b[39m Microsimulation(dataset\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcps)\n\u001b[0;32m--> 147\u001b[0m puf_sim \u001b[38;5;241m=\u001b[39m \u001b[43mMicrosimulation\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpuf\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 149\u001b[0m puf_sim\u001b[38;5;241m.\u001b[39msubsample(\u001b[38;5;241m10_000\u001b[39m)\n\u001b[1;32m 151\u001b[0m INPUTS \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 152\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mage\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 153\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mis_male\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mis_tax_unit_dependent\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 159\u001b[0m ]\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/system.py:221\u001b[0m, in \u001b[0;36mMicrosimulation.__init__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 214\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 215\u001b[0m dataset \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 
216\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, \u001b[38;5;28mstr\u001b[39m)\n\u001b[1;32m 217\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcps_2023\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m dataset\n\u001b[1;32m 218\u001b[0m ):\n\u001b[1;32m 219\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m2023\u001b[39m\n\u001b[0;32m--> 221\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 223\u001b[0m reform \u001b[38;5;241m=\u001b[39m create_structural_reforms_from_parameters(\n\u001b[1;32m 224\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtax_benefit_system\u001b[38;5;241m.\u001b[39mparameters, start_instant\n\u001b[1;32m 225\u001b[0m )\n\u001b[1;32m 226\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m reform \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:184\u001b[0m, in \u001b[0;36mSimulation.__init__\u001b[0;34m(self, tax_benefit_system, populations, situation, dataset, reform, trace, default_input_period, default_calculation_period)\u001b[0m\n\u001b[1;32m 180\u001b[0m dataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_file(\n\u001b[1;32m 181\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 182\u001b[0m )\n\u001b[1;32m 183\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, \u001b[38;5;28mtype\u001b[39m):\n\u001b[0;32m--> 184\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset: Dataset \u001b[38;5;241m=\u001b[39m \u001b[43mdataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequire\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 185\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, pd\u001b[38;5;241m.\u001b[39mDataFrame):\n\u001b[1;32m 186\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_dataframe(\n\u001b[1;32m 187\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 188\u001b[0m )\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/data/dataset.py:101\u001b[0m, in \u001b[0;36mDataset.__init__\u001b[0;34m(self, require)\u001b[0m\n\u001b[1;32m 99\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexists \u001b[38;5;129;01mand\u001b[39;00m require:\n\u001b[1;32m 100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39murl \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 
101\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdownload\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 102\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgenerate()\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/data/dataset.py:343\u001b[0m, in \u001b[0;36mDataset.download\u001b[0;34m(self, url, version)\u001b[0m\n\u001b[1;32m 341\u001b[0m response \u001b[38;5;241m=\u001b[39m requests\u001b[38;5;241m.\u001b[39mget(url, headers\u001b[38;5;241m=\u001b[39mauth_headers)\n\u001b[1;32m 342\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m response\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m200\u001b[39m:\n\u001b[0;32m--> 343\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 344\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid response code \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresponse\u001b[38;5;241m.\u001b[39mstatus_code\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m for url \u001b[39m\u001b[38;5;132;01m{\u001b[39;00murl\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 345\u001b[0m )\n\u001b[1;32m 346\u001b[0m assets \u001b[38;5;241m=\u001b[39m response\u001b[38;5;241m.\u001b[39mjson()[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124massets\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 347\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m asset \u001b[38;5;129;01min\u001b[39;00m assets:\n", + "\u001b[0;31mValueError\u001b[0m: Invalid response code 404 for url https://api.github.com/repos/policyengine/irs-soi-puf/releases/tags/1.8.0." + ] + } + ], "source": [ "## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation\n", "\n", @@ -85,7 +280,7 @@ ], "metadata": { "kernelspec": { - "display_name": "pe", + "display_name": "policyengine-us-data", "language": "python", "name": "python3" }, @@ -99,7 +294,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.11" + "version": "3.11.13" } }, "nbformat": 4, From f6d7f0fa00f158f099c2dc15116fac4987d33085 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Mon, 14 Jul 2025 15:22:58 +0200 Subject: [PATCH 10/58] few updates to the testing framework --- changelog_entry.yaml | 2 +- .../datasets/cps/enhanced_cps.py | 78 +++++++++++++------ policyengine_us_data/utils/minimise.py | 75 +++++++++++++----- pyproject.toml | 4 +- test_minimization_approach.ipynb | 75 +++++++++--------- 5 files changed, 149 insertions(+), 85 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 84eeb584..ac664753 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,4 +1,4 @@ - bump: minor changes: added: - - Enhanced CPS minimizing tests. \ No newline at end of file + - Enhanced CPS minimizing tests. 
\ No newline at end of file diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 08798622..6ad510f3 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -22,13 +22,25 @@ torch = None +bad_targets = [ + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", +] + + def reweight( original_weights, loss_matrix, targets_array, dropout_rate=0.05, log_path="calibration_log.csv", - penalty_approach="l0_sigmoid", + penalty_approach=None, ): target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") @@ -64,35 +76,43 @@ def loss(weights, penalty_approach=penalty_approach): if torch.isnan(rel_error_normalized).any(): raise ValueError("Relative error contains NaNs") - # L0 penalty (approximated with smooth function) - # Since L0 is non-differentiable, we use a smooth approximation - # Common approaches: + if penalty_approach is not None: + # L0 penalty (approximated with smooth function) + # Since L0 is non-differentiable, we use a smooth approximation + # Common approaches: + + epsilon = 1e-3 # Threshold for "near zero" + l0_penalty_weight = 1e-1 # Adjust this hyperparameter - epsilon = 1e-3 # Threshold for "near zero" - l0_penalty_weight = 1e-1 # Adjust this hyperparameter + # Option 1: Sigmoid approximation + if penalty_approach == "l0_sigmoid": + smoothed_l0 = torch.sigmoid( + (weights - epsilon) / (epsilon * 0.1) + ).mean() - # Option 1: Sigmoid approximation - if penalty_approach == "l0_sigmoid": - smoothed_l0 = torch.sigmoid( - (weights - epsilon) / (epsilon * 0.1) - ).mean() + # Option 2: Log-sum penalty (smoother) + if penalty_approach == "l0_log": + smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len( + weights + ) - # Option 2: Log-sum penalty (smoother) - if penalty_approach == "l0_log": - smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len(weights) + # Option 3: Exponential penalty + if penalty_approach == "l0_exp": + smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() - # Option 3: Exponential penalty - if penalty_approach == "l0_exp": - smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() + # L1 penalty + l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs - # L1 penalty - l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs + if penalty_approach == "l1": + l1 = torch.mean(weights) + return rel_error_normalized.mean() + l1_penalty_weight * l1 - if penalty_approach == "l1": - l1 = torch.mean(weights) - return rel_error_normalized.mean() + l1_penalty_weight * l1 + return ( + rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 + ) - return rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 + else: + return rel_error_normalized.mean() def dropout_weights(weights, p): if p == 0: @@ -213,10 +233,18 @@ def generate(self): 
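# --- Editor's illustrative sketch (not part of the patch above) ---------------
# The hunk above sketches several smooth stand-ins for an L0 sparsity penalty on
# the household weights, plus an L1 option. They are re-stated here as
# standalone functions so they can be inspected or unit-tested in isolation.
# This is a minimal sketch, assuming PyTorch is available; EPSILON and the
# penalty multipliers simply mirror the hyperparameters in the diff and are not
# tuned recommendations.
import torch

EPSILON = 1e-3  # threshold below which a household weight counts as "near zero"


def smooth_l0_sigmoid(weights: torch.Tensor) -> torch.Tensor:
    # Sharp sigmoid step around EPSILON: a differentiable proxy for counting
    # non-zero weights.
    return torch.sigmoid((weights - EPSILON) / (EPSILON * 0.1)).mean()


def smooth_l0_log(weights: torch.Tensor) -> torch.Tensor:
    # Log-sum penalty: steep near zero, grows slowly for large weights.
    return torch.log(1 + weights / EPSILON).sum() / len(weights)


def smooth_l0_exp(weights: torch.Tensor) -> torch.Tensor:
    # Exponential penalty: saturates at 1 for weights well above EPSILON.
    return (1 - torch.exp(-weights / EPSILON)).mean()


def l1_penalty(weights: torch.Tensor) -> torch.Tensor:
    return weights.mean()


if __name__ == "__main__":
    w = torch.rand(1_000) * 2          # hypothetical household weights
    base_loss = torch.tensor(0.05)     # stand-in for the calibration loss term
    print(base_loss + 1e-1 * smooth_l0_sigmoid(w))   # L0-penalised loss
    print(base_loss + 1e-2 * l1_penalty(w))          # L1-penalised loss
# -------------------------------------------------------------------------------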
loss_matrix, targets_array = build_loss_matrix( self.input_dataset, year ) + + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_array_clean = targets_array[keep_idx] + assert loss_matrix_clean.shape[1] == targets_array_clean.size + optimised_weights = reweight( original_weights, - loss_matrix, - targets_array, + loss_matrix_clean, + targets_array_clean, log_path="calibration_log.csv", ) data["household_weight"][year] = optimised_weights diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index e84e1bee..df193c6e 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -7,30 +7,53 @@ from policyengine_us_data.storage import STORAGE_FOLDER from typing import Optional, Callable - -def create_calibration_log_file(file_path): +bad_targets = [ + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", +] + + +def create_calibration_log_file(file_path, epoch=0): dataset = Dataset.from_file(file_path) - loss_matrix = build_loss_matrix(dataset, 2024) + loss_matrix, targets = build_loss_matrix(dataset, 2024) + + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_clean = targets[keep_idx] + assert loss_matrix_clean.shape[1] == targets_clean.size sim = Microsimulation(dataset=dataset) - estimates = sim.calculate("household_weight", 2024).values @ loss_matrix[0] - target_names = loss_matrix[0].columns - target_values = loss_matrix[1] + estimates = ( + sim.calculate("household_weight", 2024).values @ loss_matrix_clean + ) + target_names = loss_matrix_clean.columns df = pd.DataFrame( { "target_name": target_names, "estimate": estimates, - "target": target_values, + "target": targets_clean, } ) - df["epoch"] = 0 + df["epoch"] = epoch df["error"] = df["estimate"] - df["target"] df["rel_error"] = df["error"] / df["target"] df["abs_error"] = df["error"].abs() - df["rel_abs_error"] = df["abs_error"] / df["target"].abs() + df["rel_abs_error"] = ( + df["abs_error"] / df["target"].abs() + if df["target"].abs().sum() > 0 + else np.nan + ) df["loss"] = (df["rel_error"] ** 2).mean() df.to_csv( @@ -215,11 +238,14 @@ def random_sampling_minimization( estimate_matrix, targets, normalisation_factor, + random=True, target_fractions=[0.1, 0.2, 0.3, 0.4, 0.5], ): """A simple random sampling approach""" n = len(weights) + household_weights_normalized = weights / weights.sum() + final_mask = None lowest_loss = float("inf") for fraction in target_fractions: @@ -230,7 +256,14 @@ def random_sampling_minimization( for _ in range(5): # Try 5 random samples mask = np.zeros(n, dtype=bool) - mask[np.random.choice(n, target_size, replace=False)] = True + mask[ 
+ np.random.choice( + n, + target_size, + p=household_weights_normalized if random else None, + replace=False, + ) + ] = True loss = get_loss_from_mask( weights, mask, estimate_matrix, targets, normalisation_factor @@ -253,8 +286,6 @@ def minimise_dataset( minimization_function: Callable = candidate_loss_contribution, **kwargs, ) -> None: - #loss_rel_change_max = kwargs.pop('loss_rel_change_max', 10.0) - """ Main function to minimize a dataset using a specified minimization approach. @@ -270,13 +301,19 @@ def minimise_dataset( create_calibration_log_file(dataset) dataset = Dataset.from_file(dataset) - loss_matrix = build_loss_matrix(dataset, 2024) + loss_matrix, targets = build_loss_matrix(dataset, 2024) + + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_clean = targets[keep_idx] + assert loss_matrix_clean.shape[1] == targets_clean.size sim = Microsimulation(dataset=dataset) weights = sim.calculate("household_weight", 2024).values - estimate_matrix, targets = loss_matrix - is_national = estimate_matrix.columns.str.startswith("nation/") + is_national = loss_matrix_clean.columns.str.startswith("nation/") nation_normalisation_factor = is_national * (1 / is_national.sum()) state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) normalisation_factor = np.where( @@ -286,10 +323,10 @@ def minimise_dataset( # Call the minimization function inclusion_mask = minimization_function( weights=weights, - estimate_matrix=estimate_matrix, - targets=targets, + estimate_matrix=loss_matrix_clean, + targets=targets_clean, normalisation_factor=normalisation_factor, - **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. + **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. 
) # Extract household IDs for remaining households @@ -330,7 +367,7 @@ def minimise_dataset( f.create_dataset(f"{variable}/{year}", data=value) print(f"Saved minimised dataset to {output_path}") - create_calibration_log_file(output_path) + create_calibration_log_file(output_path, epoch=500) if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml index 0352db69..65d1ca8e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,8 +17,8 @@ authors = [ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ - "policyengine-us>=1.197.0", - "policyengine-core>=3.14.1", + "policyengine-us>=1.340.0", + "policyengine-core>=3.17.1", "requests", "tqdm", "microdf_python>=0.4.3", diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 8400d4fe..54f3c6fa 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 6, + "execution_count": 1, "id": "d6dc9cca", "metadata": {}, "outputs": [], @@ -13,12 +13,24 @@ "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", "from policyengine_us_data.utils import build_loss_matrix\n", "import numpy as np\n", - "import os\n" + "import os" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, + "id": "6daabe7c", + "metadata": {}, + "outputs": [], + "source": [ + "# Original ECPS 2024 dataset size (for household entity): 41310\n", + "# After minimization through \"candidate_loss_contribution\" and a 1.0 max error change (for household entity): 20655 \n", + "# After minimization through \"candidate_loss_contribution\" and a 0.001 max error change (for household entity): 24786\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "id": "db975ac1", "metadata": {}, "outputs": [ @@ -128,18 +140,17 @@ "Targeting Medicaid enrollment for WI with target 1108320k\n", "Targeting Medicaid enrollment for WV with target 467632k\n", "Targeting Medicaid enrollment for WY with target 57320k\n", - "Iteration 1: Loss changed from 4.565926440883813e-08 to 5.336074793134824e-08\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 2: Loss changed from 5.336074793134824e-08 to 6.613975524012925e-08\n", + "Iteration 1: Loss changed from 3.2762747622384236e-07 to 3.269240481699973e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Iteration 3: Loss changed from 6.613975524012925e-08 to 8.772356346466888e-08\n", + "Iteration 2: Loss changed from 3.269240481699973e-07 to 3.260236288186747e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Iteration 4: Loss changed from 8.772356346466888e-08 to 1.1406333044735764e-07\n", + "Iteration 3: Loss changed from 3.260236288186747e-07 to 3.258031076364204e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Iteration 5: Loss changed from 1.1406333044735764e-07 to 1.3666972105275468e-07\n", + "Iteration 4: Loss changed from 3.258031076364204e-07 to 3.2572688458943833e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Weight relative change: 52.19%\n", - "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", + "Iteration 5: Loss changed from 3.2572688458943833e-07 to 3.2598496400284314e-07, which is too high (0.08%). 
Stopping.\n", + "Weight relative change: 99.10%\n", + "Saved minimised dataset to /Users/movil1/Desktop/PYTHONJOBS/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", "Targeting Medicaid enrollment for AK with target 231577k\n", "Targeting Medicaid enrollment for AL with target 766009k\n", "Targeting Medicaid enrollment for AR with target 733561k\n", @@ -203,7 +214,7 @@ "\n", "approach = \"candidate_loss_contribution\" # for which you can specify the fraction\n", "minimization_function = candidate_loss_contribution\n", - "# other minimization function approach is \"candidate_loss_contribution\", for which you can specify the tolerance for loss relative change.\n", + "# other minimization function approach is \"random_sampling_minimization\", for which you can specify the tolerance for loss relative change.\n", "\n", "for file in files:\n", " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", @@ -212,38 +223,18 @@ " minimise_dataset(\n", " file,\n", " output_path,\n", - " loss_rel_change_max=10,\n", " minimization_function=minimization_function, \n", - " #target_fractions=[0.5] # remove if switching approach\n", + " # target_fractions=[0.5] # remove if switching approach\n", + " loss_rel_change_max=0.0001, # remove if switching approach\n", " )" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "35892c9d", "metadata": {}, - "outputs": [ - { - "ename": "ValueError", - "evalue": "Invalid response code 404 for url https://api.github.com/repos/policyengine/irs-soi-puf/releases/tags/1.8.0.", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[5], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m approach \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124ml0_sigmoid\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# other options are \"l0_log\", \"l0_exp\", \"l1\"\u001b[39;00m\n\u001b[0;32m----> 8\u001b[0m sim \u001b[38;5;241m=\u001b[39m \u001b[43mMicrosimulation\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minput_dataset\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 9\u001b[0m data \u001b[38;5;241m=\u001b[39m sim\u001b[38;5;241m.\u001b[39mdataset\u001b[38;5;241m.\u001b[39mload_dataset()\n\u001b[1;32m 10\u001b[0m data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhousehold_weight\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m {}\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/system.py:221\u001b[0m, in \u001b[0;36mMicrosimulation.__init__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 214\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 215\u001b[0m dataset \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, \u001b[38;5;28mstr\u001b[39m)\n\u001b[1;32m 217\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcps_2023\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m dataset\n\u001b[1;32m 218\u001b[0m ):\n\u001b[1;32m 219\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m2023\u001b[39m\n\u001b[0;32m--> 221\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 223\u001b[0m reform \u001b[38;5;241m=\u001b[39m create_structural_reforms_from_parameters(\n\u001b[1;32m 224\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtax_benefit_system\u001b[38;5;241m.\u001b[39mparameters, start_instant\n\u001b[1;32m 225\u001b[0m )\n\u001b[1;32m 226\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m reform \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:184\u001b[0m, in \u001b[0;36mSimulation.__init__\u001b[0;34m(self, tax_benefit_system, populations, situation, dataset, reform, trace, default_input_period, default_calculation_period)\u001b[0m\n\u001b[1;32m 180\u001b[0m dataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_file(\n\u001b[1;32m 181\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 182\u001b[0m )\n\u001b[1;32m 183\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, \u001b[38;5;28mtype\u001b[39m):\n\u001b[0;32m--> 184\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset: Dataset \u001b[38;5;241m=\u001b[39m \u001b[43mdataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequire\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 185\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, pd\u001b[38;5;241m.\u001b[39mDataFrame):\n\u001b[1;32m 186\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_dataframe(\n\u001b[1;32m 187\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 188\u001b[0m )\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/data/dataset.py:103\u001b[0m, in \u001b[0;36mDataset.__init__\u001b[0;34m(self, require)\u001b[0m\n\u001b[1;32m 101\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdownload()\n\u001b[1;32m 102\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 103\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/datasets/cps/extended_cps.py:147\u001b[0m, in \u001b[0;36mExtendedCPS.generate\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 144\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpolicyengine_us\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Microsimulation\n\u001b[1;32m 146\u001b[0m cps_sim \u001b[38;5;241m=\u001b[39m 
Microsimulation(dataset\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcps)\n\u001b[0;32m--> 147\u001b[0m puf_sim \u001b[38;5;241m=\u001b[39m \u001b[43mMicrosimulation\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpuf\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 149\u001b[0m puf_sim\u001b[38;5;241m.\u001b[39msubsample(\u001b[38;5;241m10_000\u001b[39m)\n\u001b[1;32m 151\u001b[0m INPUTS \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 152\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mage\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 153\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mis_male\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mis_tax_unit_dependent\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 159\u001b[0m ]\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/system.py:221\u001b[0m, in \u001b[0;36mMicrosimulation.__init__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 214\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 215\u001b[0m dataset \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, \u001b[38;5;28mstr\u001b[39m)\n\u001b[1;32m 217\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcps_2023\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m dataset\n\u001b[1;32m 218\u001b[0m ):\n\u001b[1;32m 219\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m2023\u001b[39m\n\u001b[0;32m--> 221\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 223\u001b[0m reform \u001b[38;5;241m=\u001b[39m create_structural_reforms_from_parameters(\n\u001b[1;32m 224\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtax_benefit_system\u001b[38;5;241m.\u001b[39mparameters, start_instant\n\u001b[1;32m 225\u001b[0m )\n\u001b[1;32m 226\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m reform \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:184\u001b[0m, in \u001b[0;36mSimulation.__init__\u001b[0;34m(self, tax_benefit_system, populations, situation, dataset, reform, trace, default_input_period, default_calculation_period)\u001b[0m\n\u001b[1;32m 180\u001b[0m dataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_file(\n\u001b[1;32m 181\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 182\u001b[0m )\n\u001b[1;32m 183\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, 
\u001b[38;5;28mtype\u001b[39m):\n\u001b[0;32m--> 184\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset: Dataset \u001b[38;5;241m=\u001b[39m \u001b[43mdataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequire\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 185\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, pd\u001b[38;5;241m.\u001b[39mDataFrame):\n\u001b[1;32m 186\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_dataframe(\n\u001b[1;32m 187\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 188\u001b[0m )\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/data/dataset.py:101\u001b[0m, in \u001b[0;36mDataset.__init__\u001b[0;34m(self, require)\u001b[0m\n\u001b[1;32m 99\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexists \u001b[38;5;129;01mand\u001b[39;00m require:\n\u001b[1;32m 100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39murl \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 101\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdownload\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 102\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgenerate()\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/data/dataset.py:343\u001b[0m, in \u001b[0;36mDataset.download\u001b[0;34m(self, url, version)\u001b[0m\n\u001b[1;32m 341\u001b[0m response \u001b[38;5;241m=\u001b[39m requests\u001b[38;5;241m.\u001b[39mget(url, headers\u001b[38;5;241m=\u001b[39mauth_headers)\n\u001b[1;32m 342\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m response\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m200\u001b[39m:\n\u001b[0;32m--> 343\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 344\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid response code \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresponse\u001b[38;5;241m.\u001b[39mstatus_code\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m for url \u001b[39m\u001b[38;5;132;01m{\u001b[39;00murl\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 345\u001b[0m )\n\u001b[1;32m 346\u001b[0m assets \u001b[38;5;241m=\u001b[39m response\u001b[38;5;241m.\u001b[39mjson()[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124massets\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 347\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m asset \u001b[38;5;129;01min\u001b[39;00m assets:\n", - "\u001b[0;31mValueError\u001b[0m: Invalid response code 404 for url https://api.github.com/repos/policyengine/irs-soi-puf/releases/tags/1.8.0." 
- ] - } - ], + "outputs": [], "source": [ "## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation\n", "\n", @@ -276,11 +267,19 @@ "\n", "data.save_dataset(output_path)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4cf8e89", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "policyengine-us-data", + "display_name": "pe", "language": "python", "name": "python3" }, @@ -294,7 +293,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.13" + "version": "3.11.11" } }, "nbformat": 4, From a042a01f7826997d0ac99b330183b80cfee167df Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 13:44:20 -0400 Subject: [PATCH 11/58] added CPS_2023 to lite mode generation --- changelog_entry.yaml | 6 +++--- policyengine_us_data/datasets/cps/cps.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index ac664753..dcce3f1a 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,4 +1,4 @@ -- bump: minor +- bump: patch changes: - added: - - Enhanced CPS minimizing tests. \ No newline at end of file + changed: + - lite mode now builds CPS_2023 in addition to CPS_2024 diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 3b976a31..fde981ba 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2006,6 +2006,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if __name__ == "__main__": if test_lite: + CPS_2023().generate() CPS_2024().generate() else: CPS_2021().generate() From cabeb56c7a1fe926eaf4c5aa5ecd26f45df3043f Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 14:54:23 -0400 Subject: [PATCH 12/58] Fixed manual test --- .github/workflows/code_changes.yaml | 1 + .github/workflows/manual_tests.yaml | 2 +- policyengine_us_data/datasets/cps/cps.py | 1 + pyproject.toml | 4 ++-- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/code_changes.yaml b/.github/workflows/code_changes.yaml index 6b474227..edd804db 100644 --- a/.github/workflows/code_changes.yaml +++ b/.github/workflows/code_changes.yaml @@ -2,6 +2,7 @@ name: Code changes on: + workflow_call: push: branches: - main diff --git a/.github/workflows/manual_tests.yaml b/.github/workflows/manual_tests.yaml index a2daca18..fb13ba89 100644 --- a/.github/workflows/manual_tests.yaml +++ b/.github/workflows/manual_tests.yaml @@ -11,7 +11,7 @@ on: jobs: test: - uses: ./.github/workflows/pr_changelog.yaml + uses: ./.github/workflows/code_changes.yaml with: TEST_LITE: ${{ github.event.inputs.test_lite }} secrets: inherit diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index fde981ba..177f4707 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2008,6 +2008,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() + print(2 + 2) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index 65d1ca8e..3490ff1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,8 +17,8 @@ authors = [ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ - "policyengine-us>=1.340.0", - "policyengine-core>=3.17.1", + "policyengine-us>=1.333.0", + "policyengine-core>=3.14.1", "requests", "tqdm", 
"microdf_python>=0.4.3", From 7b76afba9eb55c3d2588c1ba5c6683a48e3709f7 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 15:02:22 -0400 Subject: [PATCH 13/58] try again with locked version --- policyengine_us_data/datasets/cps/cps.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 177f4707..09a594c3 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2008,7 +2008,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 2) + print(2 + 3) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index 3490ff1b..74af05bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ authors = [ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ - "policyengine-us>=1.333.0", + "policyengine-us==1.333.0", "policyengine-core>=3.14.1", "requests", "tqdm", From 4056df4762b5d5e98ff6da815eae8de1484a4c25 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 15:44:32 -0400 Subject: [PATCH 14/58] trying things --- policyengine_us_data/datasets/cps/cps.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 09a594c3..1edce6e9 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -100,9 +100,14 @@ def downsample(self, frac: float): original_dtypes = { key: original_data[key].dtype for key in original_data } - + print("\n\nHERE IS THE PROBLEM-----") + print(f"frac is {frac}") + print(self) + print(Microsimulation) sim = Microsimulation(dataset=self) - sim.subsample(frac=frac) + print(sim) + print(sim.subsample) + #sim.subsample(frac=frac) for key in original_data: if key not in sim.tax_benefit_system.variables: From 96c4c25b71b5e148059be66a28805ad41c8cc28b Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 15:45:47 -0400 Subject: [PATCH 15/58] lint --- policyengine_us_data/datasets/cps/cps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 1edce6e9..30688719 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -107,7 +107,7 @@ def downsample(self, frac: float): sim = Microsimulation(dataset=self) print(sim) print(sim.subsample) - #sim.subsample(frac=frac) + # sim.subsample(frac=frac) for key in original_data: if key not in sim.tax_benefit_system.variables: From e20c75c202531e72fd118107c40fa10a0cda6e79 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 16:05:26 -0400 Subject: [PATCH 16/58] trying 3.11.12 --- policyengine_us_data/datasets/cps/cps.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 30688719..8219e915 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -107,7 +107,7 @@ def downsample(self, frac: float): sim = Microsimulation(dataset=self) print(sim) print(sim.subsample) - # sim.subsample(frac=frac) + sim.subsample(frac=frac) for key in original_data: if key not in sim.tax_benefit_system.variables: @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if 
test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 3) + print(2 + 5) else: CPS_2021().generate() CPS_2022().generate() From 776eda8ce513f7e1b845cb8212abd17301e46c73 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 16:10:26 -0400 Subject: [PATCH 17/58] now actually specifying py version --- policyengine_us_data/datasets/cps/cps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 8219e915..a25aba26 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 5) + print(2 + 7) else: CPS_2021().generate() CPS_2022().generate() From cd771794473e0bb1f5005e7d6c598d8c1bc2a112 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 16:33:21 -0400 Subject: [PATCH 18/58] pandas v --- policyengine_us_data/datasets/cps/cps.py | 2 +- pyproject.toml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index a25aba26..b3554604 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 7) + print(2 + 8) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index 74af05bf..6c767ede 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ requires-python = ">=3.11, <3.13.0" dependencies = [ "policyengine-us==1.333.0", "policyengine-core>=3.14.1", + "pandas==2.3.1", "requests", "tqdm", "microdf_python>=0.4.3", From d0ce44db56b066e4d370bc434fba08435f65e01f Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 16:47:12 -0400 Subject: [PATCH 19/58] small runner --- .github/workflows/pr_code_changes.yaml | 2 +- policyengine_us_data/datasets/cps/cps.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 213d192f..385e5a4c 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -49,7 +49,7 @@ jobs: run: python -c "from policyengine_core.data import Dataset; print('Core import OK')" Test: - runs-on: larger-runner + runs-on: ubuntu-latest needs: Lint env: HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index b3554604..027c2ef5 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 8) + print(2 + 0) else: CPS_2021().generate() CPS_2022().generate() From eb96cd5f706b0b718c39e36fa4fd1854bb3e3b0d Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 16:53:57 -0400 Subject: [PATCH 20/58] trying everything --- .github/workflows/pr_code_changes.yaml | 2 +- policyengine_us_data/datasets/cps/cps.py | 2 +- pyproject.toml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 385e5a4c..02209591 100644 --- a/.github/workflows/pr_code_changes.yaml +++ 
b/.github/workflows/pr_code_changes.yaml @@ -63,7 +63,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.11' + python-version: '3.11.12' - name: Install package run: uv pip install -e .[dev] --system diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 027c2ef5..afbf223f 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 0) + print(2 + 9) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index 6c767ede..d87290a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,11 +15,11 @@ authors = [ {name = "PolicyEngine", email = "hello@policyengine.org"}, ] license = {file = "LICENSE"} -requires-python = ">=3.11, <3.13.0" +requires-python = ">=3.11, <3.11.13" dependencies = [ "policyengine-us==1.333.0", "policyengine-core>=3.14.1", - "pandas==2.3.1", + "pandas==2.3.0", "requests", "tqdm", "microdf_python>=0.4.3", From 59ff94e82cd4dbd0aba16b488fd0b8ec16ca5531 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 17:02:45 -0400 Subject: [PATCH 21/58] relaxing python version in pyproject.toml --- policyengine_us_data/datasets/cps/cps.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index afbf223f..3173d4d6 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 9) + print(3 + 9) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index d87290a2..fe5fda52 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ authors = [ {name = "PolicyEngine", email = "hello@policyengine.org"}, ] license = {file = "LICENSE"} -requires-python = ">=3.11, <3.11.13" +requires-python = ">=3.11, <3.13.0" dependencies = [ "policyengine-us==1.333.0", "policyengine-core>=3.14.1", From d3fa67bf98762b48c6fe2397275c1d0aac2ff77b Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 17:29:12 -0400 Subject: [PATCH 22/58] putting things back in order. 
--- policyengine_us_data/datasets/cps/cps.py | 7 ------- pyproject.toml | 2 +- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 3173d4d6..d9957cbb 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -100,13 +100,7 @@ def downsample(self, frac: float): original_dtypes = { key: original_data[key].dtype for key in original_data } - print("\n\nHERE IS THE PROBLEM-----") - print(f"frac is {frac}") - print(self) - print(Microsimulation) sim = Microsimulation(dataset=self) - print(sim) - print(sim.subsample) sim.subsample(frac=frac) for key in original_data: @@ -2013,7 +2007,6 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(3 + 9) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index fe5fda52..4bec19eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ authors = [ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ - "policyengine-us==1.333.0", + "policyengine-us==1.340.1", "policyengine-core>=3.14.1", "pandas==2.3.0", "requests", From 273c48d7bc9db1d6f06fa859897b63c30d37b044 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Sat, 12 Jul 2025 13:01:15 +0100 Subject: [PATCH 23/58] Use normal runner in PR tests --- .github/workflows/pr_code_changes.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 02209591..c84a4b97 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -49,7 +49,7 @@ jobs: run: python -c "from policyengine_core.data import Dataset; print('Core import OK')" Test: - runs-on: ubuntu-latest + runs-on: ubuntu-latest needs: Lint env: HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} From 8c2fbda847e9945878afa4085476f56895c360f1 Mon Sep 17 00:00:00 2001 From: baogorek Date: Sat, 12 Jul 2025 09:53:07 -0400 Subject: [PATCH 24/58] added the 3.11.12 pin --- .github/workflows/code_changes.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/code_changes.yaml b/.github/workflows/code_changes.yaml index edd804db..c2340d14 100644 --- a/.github/workflows/code_changes.yaml +++ b/.github/workflows/code_changes.yaml @@ -38,7 +38,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.11' + python-version: '3.11.12' - uses: "google-github-actions/auth@v2" with: workload_identity_provider: "projects/322898545428/locations/global/workloadIdentityPools/policyengine-research-id-pool/providers/prod-github-provider" From edb09456bb8548b8b4eb94136122ab5a5b33586e Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 20:00:50 -0400 Subject: [PATCH 25/58] cps.py --- policyengine_us_data/datasets/cps/cps.py | 1 + 1 file changed, 1 insertion(+) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index d9957cbb..202f9c69 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2007,6 +2007,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() + print(3) else: CPS_2021().generate() CPS_2022().generate() From 994ac15a636b99f951e205ecb3a861e72cdc3472 Mon Sep 17 00:00:00 2001 From: baogorek 
Date: Sun, 13 Jul 2025 20:32:26 -0400 Subject: [PATCH 26/58] adding diagnostics --- .../datasets/cps/enhanced_cps.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 6ad510f3..17d3e862 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -249,6 +249,23 @@ def generate(self): ) data["household_weight"][year] = optimised_weights + print("\n\n---reweighting quick diagnostics----\n") + estimate = optimised_weights @ loss_matrix + rel_error = ( + ((estimate - targets_array) + 1) / (targets_array + 1) + ) ** 2 + print( + f"rel_error: min: {np.min(rel_error):.2f}, max: {np.max(rel_error):.2f}", + f"mean: {np.mean(rel_error):.2f}, median: {np.median(rel_error):.2f}" + ) + print("Relative error over 100% for:") + for i in np.where(rel_error > 1)[0]: + print(f"target_name: {loss_matrix.columns[i]}") + print(f"target_value: {targets_array[i]}") + print(f"estimate_value: {estimate[i]}") + print(f"has rel_error: {rel_error.values[i]:.2f}\n") + print("---End of reweighting quick diagnostics------") + self.save_dataset(data) From 341a3559f4368f65947db8f0ebe4db67e39a671c Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 20:32:47 -0400 Subject: [PATCH 27/58] lint --- policyengine_us_data/datasets/cps/enhanced_cps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 17d3e862..0da67ceb 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -256,7 +256,7 @@ def generate(self): ) ** 2 print( f"rel_error: min: {np.min(rel_error):.2f}, max: {np.max(rel_error):.2f}", - f"mean: {np.mean(rel_error):.2f}, median: {np.median(rel_error):.2f}" + f"mean: {np.mean(rel_error):.2f}, median: {np.median(rel_error):.2f}", ) print("Relative error over 100% for:") for i in np.where(rel_error > 1)[0]: From c2ab4b6466de68c8970ac859157bc941fc56287b Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 22:27:46 -0400 Subject: [PATCH 28/58] taking out bad targets --- policyengine_us_data/datasets/cps/cps.py | 1 - .../datasets/cps/enhanced_cps.py | 59 +++++++++++++++++-- 2 files changed, 53 insertions(+), 7 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 202f9c69..d9957cbb 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2007,7 +2007,6 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(3) else: CPS_2021().generate() CPS_2022().generate() diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 0da67ceb..e7a57044 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -40,7 +40,7 @@ def reweight( targets_array, dropout_rate=0.05, log_path="calibration_log.csv", - penalty_approach=None, + epochs=150, ): target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") @@ -58,8 +58,8 @@ def reweight( np.log(original_weights), requires_grad=True, dtype=torch.float32 ) - # TO DO: replace this with a call to the python reweight.py package. 
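# --- Editor's illustrative sketch (not part of the patch above) ---------------
# The "adding diagnostics" commit above prints a smoothed squared relative error
# per calibration target and flags targets missed by more than 100%. A minimal
# standalone version of that check, assuming numpy arrays: `loss_matrix` is the
# households-by-targets estimate matrix and `target_names` its column labels
# (argument names here are chosen for illustration only).
import numpy as np


def calibration_diagnostics(
    weights, loss_matrix, targets, target_names, threshold=1.0
):
    estimate = weights @ loss_matrix
    # The +1 terms mirror the patch and guard against zero-valued targets
    # blowing up the ratio.
    rel_error = (((estimate - targets) + 1) / (targets + 1)) ** 2
    print(
        f"rel_error: min {rel_error.min():.2f}, max {rel_error.max():.2f}, "
        f"mean {rel_error.mean():.2f}, median {np.median(rel_error):.2f}"
    )
    for i in np.where(rel_error > threshold)[0]:
        print(
            f"{target_names[i]}: target {targets[i]}, "
            f"estimate {estimate[i]:.0f}, rel_error {rel_error[i]:.2f}"
        )
    return rel_error
# -------------------------------------------------------------------------------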
- def loss(weights, penalty_approach=penalty_approach): + # TODO: replace this functionality from the microcalibrate package. + def loss(weights): # Check for Nans in either the weights or the loss matrix if torch.isnan(weights).any(): raise ValueError("Weights contain NaNs") @@ -129,7 +129,7 @@ def dropout_weights(weights, p): start_loss = None - iterator = trange(500) + iterator = trange(epochs) performance = pd.DataFrame() for i in iterator: optimizer.zero_grad() @@ -229,13 +229,37 @@ def generate(self): original_weights = original_weights.values + np.random.normal( 1, 0.1, len(original_weights) ) + + bad_targets = [ + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "state/RI/adjusted_gross_income/amount/-inf_1", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "state/RI/adjusted_gross_income/amount/-inf_1", + "target_name: nation/irs/exempt interest/count/AGI in -inf-inf/taxable/All", + ] + + # Run the optimization procedure to get (close to) minimum loss weights for year in range(self.start_year, self.end_year + 1): loss_matrix, targets_array = build_loss_matrix( self.input_dataset, year ) - + zero_mask = np.isclose(targets_array, 0.0, atol=0.1) bad_mask = loss_matrix.columns.isin(bad_targets) - keep_mask_bool = ~bad_mask + keep_mask_bool = ~(zero_mask | bad_mask) keep_idx = np.where(keep_mask_bool)[0] loss_matrix_clean = loss_matrix.iloc[:, keep_idx] targets_array_clean = targets_array[keep_idx] @@ -245,10 +269,33 @@ def generate(self): original_weights, loss_matrix_clean, targets_array_clean, + loss_matrix_clean, + targets_array_clean, log_path="calibration_log.csv", + epochs=150, ) data["household_weight"][year] = optimised_weights + print("\n\n---reweighting quick diagnostics----\n") + estimate = optimised_weights @ loss_matrix_clean + rel_error = ( + ((estimate - targets_array_clean) + 1) + / (targets_array_clean + 1) + ) ** 2 + print( + f"rel_error: min: {np.min(rel_error):.2f}, " + f"max: {np.max(rel_error):.2f} " + f"mean: {np.mean(rel_error):.2f}, " + f"median: {np.median(rel_error):.2f}" + ) + print("Relative error over 100% for:") + for i in np.where(rel_error > 1)[0]: + print(f"target_name: {loss_matrix_clean.columns[i]}") + print(f"target_value: {targets_array_clean[i]}") + 
print(f"estimate_value: {estimate[i]}") + print(f"has rel_error: {rel_error[i]:.2f}\n") + print("---End of reweighting quick diagnostics------") + print("\n\n---reweighting quick diagnostics----\n") estimate = optimised_weights @ loss_matrix rel_error = ( From 6f7a03a76dc95d7f9ebfd20f1df6240bd11593bc Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 23:05:09 -0400 Subject: [PATCH 29/58] fixing workflow arg passthrough --- .github/workflows/pr_code_changes.yaml | 16 +++++++++++++--- changelog_entry.yaml | 6 ++++++ pyproject.toml | 4 ++-- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index c84a4b97..56224a2e 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -11,6 +11,14 @@ on: - tests/** - .github/workflows/** + workflow_call: + inputs: + TEST_LITE: + description: 'Run in lite mode' + type: boolean + required: false + default: false + jobs: Lint: runs-on: ubuntu-latest @@ -53,6 +61,7 @@ jobs: needs: Lint env: HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} + TEST_LITE: ${{ inputs.TEST_LITE }} steps: - name: Checkout repo uses: actions/checkout@v2 @@ -63,7 +72,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.11.12' + python-version: '3.11' - name: Install package run: uv pip install -e .[dev] --system @@ -75,8 +84,9 @@ jobs: - name: Build datasets run: make data env: - TEST_LITE: true - PYTHON_LOG_LEVEL: INFO + TEST_LITE: ${{ env.TEST_LITE }} + PYTHON_LOG_LEVEL: INFO + - name: Save calibration log uses: actions/upload-artifact@v4 with: diff --git a/changelog_entry.yaml b/changelog_entry.yaml index dcce3f1a..bce8b349 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,4 +1,10 @@ - bump: patch changes: changed: + - bad targets (causing problems with estimation) removed - lite mode now builds CPS_2023 in addition to CPS_2024 + - gave reweight an epochs argument and set it at 150 for optimization + - updating minimum versions on policyengine-us and pandas dependencies + fixed: + - manual workflow now can call PR code changes + diff --git a/pyproject.toml b/pyproject.toml index 4bec19eb..481cbc37 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,9 +17,9 @@ authors = [ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ - "policyengine-us==1.340.1", + "policyengine-us>=1.340.1", "policyengine-core>=3.14.1", - "pandas==2.3.0", + "pandas>=2.3.0", "requests", "tqdm", "microdf_python>=0.4.3", From 3dba2a2aa3a578aeaa7e7acde71e53d150669036 Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 23:09:32 -0400 Subject: [PATCH 30/58] deps and defaults --- .github/workflows/code_changes.yaml | 2 +- .github/workflows/pr_code_changes.yaml | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/code_changes.yaml b/.github/workflows/code_changes.yaml index c2340d14..edd804db 100644 --- a/.github/workflows/code_changes.yaml +++ b/.github/workflows/code_changes.yaml @@ -38,7 +38,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.11.12' + python-version: '3.11' - uses: "google-github-actions/auth@v2" with: workload_identity_provider: "projects/322898545428/locations/global/workloadIdentityPools/policyengine-research-id-pool/providers/prod-github-provider" diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 
56224a2e..1e05b564 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -17,7 +17,7 @@ on: description: 'Run in lite mode' type: boolean required: false - default: false + default: true jobs: Lint: diff --git a/pyproject.toml b/pyproject.toml index 481cbc37..f983258d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ "policyengine-us>=1.340.1", - "policyengine-core>=3.14.1", + "policyengine-core>=3.17.1", "pandas>=2.3.0", "requests", "tqdm", From 7710a4cd0f58de7b2120f146228977e9c46f253d Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 23:12:21 -0400 Subject: [PATCH 31/58] wrong pipeline for manual test --- .github/workflows/manual_tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/manual_tests.yaml b/.github/workflows/manual_tests.yaml index fb13ba89..fd6fa061 100644 --- a/.github/workflows/manual_tests.yaml +++ b/.github/workflows/manual_tests.yaml @@ -11,7 +11,7 @@ on: jobs: test: - uses: ./.github/workflows/code_changes.yaml + uses: ./.github/workflows/pr_code_changes.yaml with: TEST_LITE: ${{ github.event.inputs.test_lite }} secrets: inherit From 27f46fd8d19199fad6006675bcab231da67968af Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 23:30:46 -0400 Subject: [PATCH 32/58] trying again to get the manual test to work --- .github/workflows/manual_tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/manual_tests.yaml b/.github/workflows/manual_tests.yaml index fd6fa061..55667dbc 100644 --- a/.github/workflows/manual_tests.yaml +++ b/.github/workflows/manual_tests.yaml @@ -13,5 +13,5 @@ jobs: test: uses: ./.github/workflows/pr_code_changes.yaml with: - TEST_LITE: ${{ github.event.inputs.test_lite }} + TEST_LITE: ${{ inputs.test_lite }} secrets: inherit From fef1eca57d99d8359f335ac4886eebde5b45c6c9 Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 23:53:27 -0400 Subject: [PATCH 33/58] reverting to older workflow code --- .github/workflows/manual_tests.yaml | 17 ----------------- .github/workflows/pr_code_changes.yaml | 14 ++------------ changelog_entry.yaml | 4 +--- 3 files changed, 3 insertions(+), 32 deletions(-) delete mode 100644 .github/workflows/manual_tests.yaml diff --git a/.github/workflows/manual_tests.yaml b/.github/workflows/manual_tests.yaml deleted file mode 100644 index 55667dbc..00000000 --- a/.github/workflows/manual_tests.yaml +++ /dev/null @@ -1,17 +0,0 @@ -name: Manual tests - -on: - workflow_dispatch: - inputs: - test_lite: - description: 'Run in lite mode' - required: true - default: true - type: boolean - -jobs: - test: - uses: ./.github/workflows/pr_code_changes.yaml - with: - TEST_LITE: ${{ inputs.test_lite }} - secrets: inherit diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 1e05b564..4e30d089 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -11,14 +11,6 @@ on: - tests/** - .github/workflows/** - workflow_call: - inputs: - TEST_LITE: - description: 'Run in lite mode' - type: boolean - required: false - default: true - jobs: Lint: runs-on: ubuntu-latest @@ -61,7 +53,6 @@ jobs: needs: Lint env: HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} - TEST_LITE: ${{ inputs.TEST_LITE }} steps: - name: Checkout repo uses: actions/checkout@v2 @@ -84,9 +75,8 @@ jobs: - name: Build datasets run: make data 
env: - TEST_LITE: ${{ env.TEST_LITE }} - PYTHON_LOG_LEVEL: INFO - + TEST_LITE: true + PYTHON_LOG_LEVEL: INFO - name: Save calibration log uses: actions/upload-artifact@v4 with: diff --git a/changelog_entry.yaml b/changelog_entry.yaml index bce8b349..3f9b8627 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -5,6 +5,4 @@ - lite mode now builds CPS_2023 in addition to CPS_2024 - gave reweight an epochs argument and set it at 150 for optimization - updating minimum versions on policyengine-us and pandas dependencies - fixed: - - manual workflow now can call PR code changes - + - getting rid of non-working manual workflow code From 5eb10501cd4e8f33925411de7f4574e3dec413f8 Mon Sep 17 00:00:00 2001 From: baogorek Date: Mon, 14 Jul 2025 00:12:37 -0400 Subject: [PATCH 34/58] cleaning up enhanced_cps.py --- .../datasets/cps/enhanced_cps.py | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index e7a57044..5c82d724 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -249,7 +249,7 @@ def generate(self): "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", "state/RI/adjusted_gross_income/amount/-inf_1", - "target_name: nation/irs/exempt interest/count/AGI in -inf-inf/taxable/All", + "nation/irs/exempt interest/count/AGI in -inf-inf/taxable/All", ] # Run the optimization procedure to get (close to) minimum loss weights @@ -296,23 +296,6 @@ def generate(self): print(f"has rel_error: {rel_error[i]:.2f}\n") print("---End of reweighting quick diagnostics------") - print("\n\n---reweighting quick diagnostics----\n") - estimate = optimised_weights @ loss_matrix - rel_error = ( - ((estimate - targets_array) + 1) / (targets_array + 1) - ) ** 2 - print( - f"rel_error: min: {np.min(rel_error):.2f}, max: {np.max(rel_error):.2f}", - f"mean: {np.mean(rel_error):.2f}, median: {np.median(rel_error):.2f}", - ) - print("Relative error over 100% for:") - for i in np.where(rel_error > 1)[0]: - print(f"target_name: {loss_matrix.columns[i]}") - print(f"target_value: {targets_array[i]}") - print(f"estimate_value: {estimate[i]}") - print(f"has rel_error: {rel_error.values[i]:.2f}\n") - print("---End of reweighting quick diagnostics------") - self.save_dataset(data) From 1fb4318b21072a9c5dbd2824216be49655f0b9b2 Mon Sep 17 00:00:00 2001 From: MaxGhenis Date: Mon, 14 Jul 2025 15:33:13 +0000 Subject: [PATCH 35/58] Update package version --- CHANGELOG.md | 11 +++++++++++ changelog.yaml | 9 +++++++++ changelog_entry.yaml | 8 -------- pyproject.toml | 2 +- 4 files changed, 21 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6299d8fb..e355d4dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## [1.37.1] - 2025-07-14 15:33:11 + +### Changed + +- bad targets (causing problems with estimation) removed +- lite mode now builds CPS_2023 in addition to CPS_2024 +- gave reweight an epochs argument and set it at 150 for optimization +- updating minimum versions on policyengine-us and pandas dependencies +- getting rid of non-working manual workflow code + ## [1.37.0] - 2025-07-09 14:58:33 ### Added @@ -520,6 +530,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 +[1.37.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.37.0...1.37.1 [1.37.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.36.2...1.37.0 [1.36.2]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.36.1...1.36.2 [1.36.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.36.0...1.36.1 diff --git a/changelog.yaml b/changelog.yaml index 699b2430..af7cdf32 100644 --- a/changelog.yaml +++ b/changelog.yaml @@ -433,3 +433,12 @@ added: - Medicaid state level calibration targets. date: 2025-07-09 14:58:33 +- bump: patch + changes: + changed: + - bad targets (causing problems with estimation) removed + - lite mode now builds CPS_2023 in addition to CPS_2024 + - gave reweight an epochs argument and set it at 150 for optimization + - updating minimum versions on policyengine-us and pandas dependencies + - getting rid of non-working manual workflow code + date: 2025-07-14 15:33:11 diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 3f9b8627..e69de29b 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,8 +0,0 @@ -- bump: patch - changes: - changed: - - bad targets (causing problems with estimation) removed - - lite mode now builds CPS_2023 in addition to CPS_2024 - - gave reweight an epochs argument and set it at 150 for optimization - - updating minimum versions on policyengine-us and pandas dependencies - - getting rid of non-working manual workflow code diff --git a/pyproject.toml b/pyproject.toml index f983258d..5a75693f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta" [project] name = "policyengine_us_data" -version = "1.37.0" +version = "1.37.1" description = "A package to create representative microdata for the US." readme = "README.md" authors = [ From a62328a6f47293f90e1e696d03b49b96c044321b Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Fri, 11 Jul 2025 16:24:22 +0200 Subject: [PATCH 36/58] attempting to vectorize minimizing of ecps --- changelog_entry.yaml | 4 ++ .../datasets/cps/enhanced_cps.py | 53 +++++++------------ policyengine_us_data/utils/minimise.py | 51 ++++++++++++------ 3 files changed, 59 insertions(+), 49 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..84eeb584 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: minor + changes: + added: + - Enhanced CPS minimizing tests. \ No newline at end of file diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 5c82d724..6616d54c 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -58,8 +58,8 @@ def reweight( np.log(original_weights), requires_grad=True, dtype=torch.float32 ) - # TODO: replace this functionality from the microcalibrate package. - def loss(weights): + # TO DO: replace this with a call to the python reweight.py package. 
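# Standalone sketch of the three smooth L0 surrogates introduced in the loss
# function below, evaluated on a toy weight vector (epsilon matches the diff;
# the weight values are illustrative). The sigmoid and exponential forms
# approximate the fraction of weights above epsilon, while the log-sum form
# penalises weight magnitude more smoothly.
import torch

weights = torch.tensor([0.0, 5e-4, 0.05, 2.0])
epsilon = 1e-3

l0_sigmoid = torch.sigmoid((weights - epsilon) / (epsilon * 0.1)).mean()
l0_log = torch.log(1 + weights / epsilon).sum() / len(weights)
l0_exp = (1 - torch.exp(-weights / epsilon)).mean()
print(l0_sigmoid.item(), l0_log.item(), l0_exp.item())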
+ def loss(weights, penalty_approach="l0_sigmoid"): # Check for Nans in either the weights or the loss matrix if torch.isnan(weights).any(): raise ValueError("Weights contain NaNs") @@ -76,43 +76,30 @@ def loss(weights): if torch.isnan(rel_error_normalized).any(): raise ValueError("Relative error contains NaNs") - if penalty_approach is not None: - # L0 penalty (approximated with smooth function) - # Since L0 is non-differentiable, we use a smooth approximation - # Common approaches: + # L0 penalty (approximated with smooth function) + # Since L0 is non-differentiable, we use a smooth approximation + # Common approaches: - epsilon = 1e-3 # Threshold for "near zero" - l0_penalty_weight = 1e-1 # Adjust this hyperparameter + epsilon = 1e-3 # Threshold for "near zero" + l0_penalty_weight = 1e-1 # Adjust this hyperparameter - # Option 1: Sigmoid approximation - if penalty_approach == "l0_sigmoid": - smoothed_l0 = torch.sigmoid( - (weights - epsilon) / (epsilon * 0.1) - ).mean() + # Option 1: Sigmoid approximation + if penalty_approach == "l0_sigmoid": + smoothed_l0 = torch.sigmoid( + (weights - epsilon) / (epsilon * 0.1) + ).mean() - # Option 2: Log-sum penalty (smoother) - if penalty_approach == "l0_log": - smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len( - weights - ) + # Option 2: Log-sum penalty (smoother) + if penalty_approach == "l0_log": + smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len(weights) - # Option 3: Exponential penalty - if penalty_approach == "l0_exp": - smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() + # Option 3: Exponential penalty + if penalty_approach == "l0_exp": + smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() - # L1 penalty - l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs + # L1 penalty - if penalty_approach == "l1": - l1 = torch.mean(weights) - return rel_error_normalized.mean() + l1_penalty_weight * l1 - - return ( - rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 - ) - - else: - return rel_error_normalized.mean() + return rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 def dropout_weights(weights, p): if p == 0: diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index df193c6e..ca985378 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -5,21 +5,10 @@ import pandas as pd import h5py from policyengine_us_data.storage import STORAGE_FOLDER -from typing import Optional, Callable +from typing import Optional -bad_targets = [ - "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", - "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", - "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", - "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", - "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", -] - -def create_calibration_log_file(file_path, epoch=0): +def create_calibration_log_file(file_path): dataset = Dataset.from_file(file_path) loss_matrix, targets = build_loss_matrix(dataset, 2024) @@ -112,6 +101,27 @@ def losses_for_candidates( return losses +def minimise_dataset( + dataset, 
output_path: str, loss_rel_change_max: float +) -> None: + dataset = str(dataset) + create_calibration_log_file(dataset) + + dataset = Dataset.from_file(dataset) + loss_matrix = build_loss_matrix(dataset, 2024) + + sim = Microsimulation(dataset=dataset) + + weights = sim.calculate("household_weight", 2024).values + estimate_matrix, targets = loss_matrix + is_national = estimate_matrix.columns.str.startswith("nation/") + nation_normalisation_factor = is_national * (1 / is_national.sum()) + state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) + normalisation_factor = np.where( + is_national, nation_normalisation_factor, state_normalisation_factor + ) + weights @ estimate_matrix + def get_loss_from_mask( weights, inclusion_mask, estimate_matrix, targets, normalisation_factor ): @@ -185,16 +195,25 @@ def candidate_loss_contribution( replace=False, ) - # Compute losses for the batch in one shot + # more efficient approach to compute losses for candidate households to be removed + + # 1. sample only households that are currently *included* + indices = np.random.choice( + np.where(full_mask)[0], + size=int(full_mask.sum() * VIEW_FRACTION_PER_ITERATION), + replace=False, + ) + # 2. compute losses for the batch in one shot candidate_losses = losses_for_candidates( weights, indices, estimate_matrix, targets, normalisation_factor ) - - # Convert to relative change vs. baseline + # 3. convert to relative change vs. baseline household_loss_rel_changes = ( candidate_losses - baseline_loss ) / baseline_loss + inclusion_mask = full_mask.copy() + household_loss_rel_changes = np.array(household_loss_rel_changes) # Sort by the relative change in loss sorted_indices = np.argsort(household_loss_rel_changes) From 6d3f8b4daea6ab498b105bf9429b74e52462cde4 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Fri, 11 Jul 2025 19:19:58 +0200 Subject: [PATCH 37/58] add notebook with testing functionality (havent tested locally) --- .../datasets/cps/enhanced_cps.py | 9 +- policyengine_us_data/utils/minimise.py | 2 +- test_minimization_approach.ipynb | 210 +----------------- 3 files changed, 16 insertions(+), 205 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 6616d54c..ca53a84d 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -40,7 +40,7 @@ def reweight( targets_array, dropout_rate=0.05, log_path="calibration_log.csv", - epochs=150, + penalty_approach="l0_sigmoid", ): target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") @@ -59,7 +59,7 @@ def reweight( ) # TO DO: replace this with a call to the python reweight.py package. 
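# Toy sketch of the masked-weights loss used in minimise.py: excluded households
# are zero-weighted and the remaining weights are rescaled so the population
# total is preserved before the loss is evaluated. All arrays here are
# illustrative stand-ins for the real weights, loss matrix and targets.
import numpy as np

weights = np.array([1.0, 2.0, 3.0, 4.0])
estimate_matrix = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0], [2.0, 0.0]])
targets = np.array([10.0, 6.0])
normalisation_factor = np.array([0.5, 0.5])
inclusion_mask = np.array([True, False, True, True])

masked_weights = weights.copy()
original_weight_total = masked_weights.sum()
masked_weights[~inclusion_mask] = 0
masked_weights[inclusion_mask] *= original_weight_total / masked_weights.sum()

estimates = masked_weights @ estimate_matrix
rel_error = ((estimates - targets) + 1) / (targets + 1)
loss = ((rel_error * normalisation_factor) ** 2).mean()
print(loss)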
- def loss(weights, penalty_approach="l0_sigmoid"): + def loss(weights, penalty_approach=penalty_approach): # Check for Nans in either the weights or the loss matrix if torch.isnan(weights).any(): raise ValueError("Weights contain NaNs") @@ -98,6 +98,11 @@ def loss(weights, penalty_approach="l0_sigmoid"): smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() # L1 penalty + l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs + + if penalty_approach == "l1": + l1 = torch.mean(weights) + return rel_error_normalized.mean() + l1_penalty_weight * l1 return rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index ca985378..da2cb7d1 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -386,7 +386,7 @@ def minimise_dataset( f.create_dataset(f"{variable}/{year}", data=value) print(f"Saved minimised dataset to {output_path}") - create_calibration_log_file(output_path, epoch=500) + create_calibration_log_file(output_path) if __name__ == "__main__": diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 54f3c6fa..519d2725 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 5, "id": "d6dc9cca", "metadata": {}, "outputs": [], @@ -12,199 +12,15 @@ "from policyengine_us import Microsimulation\n", "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", "from policyengine_us_data.utils import build_loss_matrix\n", - "import numpy as np\n", - "import os" + "import numpy as np" ] }, { "cell_type": "code", "execution_count": null, - "id": "6daabe7c", - "metadata": {}, - "outputs": [], - "source": [ - "# Original ECPS 2024 dataset size (for household entity): 41310\n", - "# After minimization through \"candidate_loss_contribution\" and a 1.0 max error change (for household entity): 20655 \n", - "# After minimization through \"candidate_loss_contribution\" and a 0.001 max error change (for household entity): 24786\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, "id": "db975ac1", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Targeting Medicaid enrollment for AK with target 231577k\n", - "Targeting Medicaid enrollment for AL with target 766009k\n", - "Targeting Medicaid enrollment for AR with target 733561k\n", - "Targeting Medicaid enrollment for AZ with target 1778734k\n", - "Targeting Medicaid enrollment for CA with target 12172695k\n", - "Targeting Medicaid enrollment for CO with target 1058326k\n", - "Targeting Medicaid enrollment for CT with target 904321k\n", - "Targeting Medicaid enrollment for DC with target 240020k\n", - "Targeting Medicaid enrollment for DE with target 236840k\n", - "Targeting Medicaid enrollment for FL with target 3568648k\n", - "Targeting Medicaid enrollment for GA with target 1699279k\n", - "Targeting Medicaid enrollment for HI with target 376318k\n", - "Targeting Medicaid enrollment for IA with target 586748k\n", - "Targeting Medicaid enrollment for ID with target 296968k\n", - "Targeting Medicaid enrollment for IL with target 2918179k\n", - "Targeting Medicaid enrollment for IN with target 1623361k\n", - "Targeting Medicaid enrollment for KS with target 335902k\n", - "Targeting Medicaid enrollment for KY with target 1244822k\n", - "Targeting Medicaid enrollment for LA 
with target 1377806k\n", - "Targeting Medicaid enrollment for MA with target 1453344k\n", - "Targeting Medicaid enrollment for MD with target 1280697k\n", - "Targeting Medicaid enrollment for ME with target 322306k\n", - "Targeting Medicaid enrollment for MI with target 2194067k\n", - "Targeting Medicaid enrollment for MN with target 1146667k\n", - "Targeting Medicaid enrollment for MO with target 1118780k\n", - "Targeting Medicaid enrollment for MS with target 514730k\n", - "Targeting Medicaid enrollment for MT with target 193278k\n", - "Targeting Medicaid enrollment for NC with target 2469712k\n", - "Targeting Medicaid enrollment for ND with target 100543k\n", - "Targeting Medicaid enrollment for NE with target 302971k\n", - "Targeting Medicaid enrollment for NH with target 166813k\n", - "Targeting Medicaid enrollment for NJ with target 1506239k\n", - "Targeting Medicaid enrollment for NM with target 686825k\n", - "Targeting Medicaid enrollment for NV with target 713936k\n", - "Targeting Medicaid enrollment for NY with target 5946806k\n", - "Targeting Medicaid enrollment for OH with target 2596879k\n", - "Targeting Medicaid enrollment for OK with target 894911k\n", - "Targeting Medicaid enrollment for OR with target 1123313k\n", - "Targeting Medicaid enrollment for PA with target 2783389k\n", - "Targeting Medicaid enrollment for RI with target 273400k\n", - "Targeting Medicaid enrollment for SC with target 932515k\n", - "Targeting Medicaid enrollment for SD with target 126952k\n", - "Targeting Medicaid enrollment for TN with target 1268904k\n", - "Targeting Medicaid enrollment for TX with target 3821806k\n", - "Targeting Medicaid enrollment for UT with target 300742k\n", - "Targeting Medicaid enrollment for VA with target 1596777k\n", - "Targeting Medicaid enrollment for VT with target 151833k\n", - "Targeting Medicaid enrollment for WA with target 1776116k\n", - "Targeting Medicaid enrollment for WI with target 1108320k\n", - "Targeting Medicaid enrollment for WV with target 467632k\n", - "Targeting Medicaid enrollment for WY with target 57320k\n", - "Targeting Medicaid enrollment for AK with target 231577k\n", - "Targeting Medicaid enrollment for AL with target 766009k\n", - "Targeting Medicaid enrollment for AR with target 733561k\n", - "Targeting Medicaid enrollment for AZ with target 1778734k\n", - "Targeting Medicaid enrollment for CA with target 12172695k\n", - "Targeting Medicaid enrollment for CO with target 1058326k\n", - "Targeting Medicaid enrollment for CT with target 904321k\n", - "Targeting Medicaid enrollment for DC with target 240020k\n", - "Targeting Medicaid enrollment for DE with target 236840k\n", - "Targeting Medicaid enrollment for FL with target 3568648k\n", - "Targeting Medicaid enrollment for GA with target 1699279k\n", - "Targeting Medicaid enrollment for HI with target 376318k\n", - "Targeting Medicaid enrollment for IA with target 586748k\n", - "Targeting Medicaid enrollment for ID with target 296968k\n", - "Targeting Medicaid enrollment for IL with target 2918179k\n", - "Targeting Medicaid enrollment for IN with target 1623361k\n", - "Targeting Medicaid enrollment for KS with target 335902k\n", - "Targeting Medicaid enrollment for KY with target 1244822k\n", - "Targeting Medicaid enrollment for LA with target 1377806k\n", - "Targeting Medicaid enrollment for MA with target 1453344k\n", - "Targeting Medicaid enrollment for MD with target 1280697k\n", - "Targeting Medicaid enrollment for ME with target 322306k\n", - "Targeting Medicaid enrollment for MI with 
target 2194067k\n", - "Targeting Medicaid enrollment for MN with target 1146667k\n", - "Targeting Medicaid enrollment for MO with target 1118780k\n", - "Targeting Medicaid enrollment for MS with target 514730k\n", - "Targeting Medicaid enrollment for MT with target 193278k\n", - "Targeting Medicaid enrollment for NC with target 2469712k\n", - "Targeting Medicaid enrollment for ND with target 100543k\n", - "Targeting Medicaid enrollment for NE with target 302971k\n", - "Targeting Medicaid enrollment for NH with target 166813k\n", - "Targeting Medicaid enrollment for NJ with target 1506239k\n", - "Targeting Medicaid enrollment for NM with target 686825k\n", - "Targeting Medicaid enrollment for NV with target 713936k\n", - "Targeting Medicaid enrollment for NY with target 5946806k\n", - "Targeting Medicaid enrollment for OH with target 2596879k\n", - "Targeting Medicaid enrollment for OK with target 894911k\n", - "Targeting Medicaid enrollment for OR with target 1123313k\n", - "Targeting Medicaid enrollment for PA with target 2783389k\n", - "Targeting Medicaid enrollment for RI with target 273400k\n", - "Targeting Medicaid enrollment for SC with target 932515k\n", - "Targeting Medicaid enrollment for SD with target 126952k\n", - "Targeting Medicaid enrollment for TN with target 1268904k\n", - "Targeting Medicaid enrollment for TX with target 3821806k\n", - "Targeting Medicaid enrollment for UT with target 300742k\n", - "Targeting Medicaid enrollment for VA with target 1596777k\n", - "Targeting Medicaid enrollment for VT with target 151833k\n", - "Targeting Medicaid enrollment for WA with target 1776116k\n", - "Targeting Medicaid enrollment for WI with target 1108320k\n", - "Targeting Medicaid enrollment for WV with target 467632k\n", - "Targeting Medicaid enrollment for WY with target 57320k\n", - "Iteration 1: Loss changed from 3.2762747622384236e-07 to 3.269240481699973e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 2: Loss changed from 3.269240481699973e-07 to 3.260236288186747e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 3: Loss changed from 3.260236288186747e-07 to 3.258031076364204e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 4: Loss changed from 3.258031076364204e-07 to 3.2572688458943833e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 5: Loss changed from 3.2572688458943833e-07 to 3.2598496400284314e-07, which is too high (0.08%). 
Stopping.\n", - "Weight relative change: 99.10%\n", - "Saved minimised dataset to /Users/movil1/Desktop/PYTHONJOBS/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", - "Targeting Medicaid enrollment for AK with target 231577k\n", - "Targeting Medicaid enrollment for AL with target 766009k\n", - "Targeting Medicaid enrollment for AR with target 733561k\n", - "Targeting Medicaid enrollment for AZ with target 1778734k\n", - "Targeting Medicaid enrollment for CA with target 12172695k\n", - "Targeting Medicaid enrollment for CO with target 1058326k\n", - "Targeting Medicaid enrollment for CT with target 904321k\n", - "Targeting Medicaid enrollment for DC with target 240020k\n", - "Targeting Medicaid enrollment for DE with target 236840k\n", - "Targeting Medicaid enrollment for FL with target 3568648k\n", - "Targeting Medicaid enrollment for GA with target 1699279k\n", - "Targeting Medicaid enrollment for HI with target 376318k\n", - "Targeting Medicaid enrollment for IA with target 586748k\n", - "Targeting Medicaid enrollment for ID with target 296968k\n", - "Targeting Medicaid enrollment for IL with target 2918179k\n", - "Targeting Medicaid enrollment for IN with target 1623361k\n", - "Targeting Medicaid enrollment for KS with target 335902k\n", - "Targeting Medicaid enrollment for KY with target 1244822k\n", - "Targeting Medicaid enrollment for LA with target 1377806k\n", - "Targeting Medicaid enrollment for MA with target 1453344k\n", - "Targeting Medicaid enrollment for MD with target 1280697k\n", - "Targeting Medicaid enrollment for ME with target 322306k\n", - "Targeting Medicaid enrollment for MI with target 2194067k\n", - "Targeting Medicaid enrollment for MN with target 1146667k\n", - "Targeting Medicaid enrollment for MO with target 1118780k\n", - "Targeting Medicaid enrollment for MS with target 514730k\n", - "Targeting Medicaid enrollment for MT with target 193278k\n", - "Targeting Medicaid enrollment for NC with target 2469712k\n", - "Targeting Medicaid enrollment for ND with target 100543k\n", - "Targeting Medicaid enrollment for NE with target 302971k\n", - "Targeting Medicaid enrollment for NH with target 166813k\n", - "Targeting Medicaid enrollment for NJ with target 1506239k\n", - "Targeting Medicaid enrollment for NM with target 686825k\n", - "Targeting Medicaid enrollment for NV with target 713936k\n", - "Targeting Medicaid enrollment for NY with target 5946806k\n", - "Targeting Medicaid enrollment for OH with target 2596879k\n", - "Targeting Medicaid enrollment for OK with target 894911k\n", - "Targeting Medicaid enrollment for OR with target 1123313k\n", - "Targeting Medicaid enrollment for PA with target 2783389k\n", - "Targeting Medicaid enrollment for RI with target 273400k\n", - "Targeting Medicaid enrollment for SC with target 932515k\n", - "Targeting Medicaid enrollment for SD with target 126952k\n", - "Targeting Medicaid enrollment for TN with target 1268904k\n", - "Targeting Medicaid enrollment for TX with target 3821806k\n", - "Targeting Medicaid enrollment for UT with target 300742k\n", - "Targeting Medicaid enrollment for VA with target 1596777k\n", - "Targeting Medicaid enrollment for VT with target 151833k\n", - "Targeting Medicaid enrollment for WA with target 1776116k\n", - "Targeting Medicaid enrollment for WI with target 1108320k\n", - "Targeting Medicaid enrollment for WV with target 467632k\n", - "Targeting Medicaid enrollment for WY with target 57320k\n" - ] - } - ], + "outputs": 
[], "source": [ "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", "\n", @@ -212,20 +28,18 @@ " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", " ]\n", "\n", - "approach = \"candidate_loss_contribution\" # for which you can specify the fraction\n", - "minimization_function = candidate_loss_contribution\n", - "# other minimization function approach is \"random_sampling_minimization\", for which you can specify the tolerance for loss relative change.\n", + "approach = \"random_sampling_minimization\" # for which you can specify the fraction\n", + "minimization_function = random_sampling_minimization\n", + "# other minimization function approach is \"candidate_loss_contribution\"\n", "\n", "for file in files:\n", " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", - " output_path.parent.mkdir(parents=True, exist_ok=True)\n", - "\n", " minimise_dataset(\n", " file,\n", " output_path,\n", + " loss_rel_change_max=10,\n", " minimization_function=minimization_function, \n", - " # target_fractions=[0.5] # remove if switching approach\n", - " loss_rel_change_max=0.0001, # remove if switching approach\n", + " target_fractions=[0.5] # remove if switching approach\n", " )" ] }, @@ -267,14 +81,6 @@ "\n", "data.save_dataset(output_path)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b4cf8e89", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From 94cacdeab007e318fe849bb3bbf4b29d7fcf627a Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Mon, 14 Jul 2025 15:22:58 +0200 Subject: [PATCH 38/58] few updates to the testing framework --- changelog_entry.yaml | 2 +- .../datasets/cps/enhanced_cps.py | 58 ++--- policyengine_us_data/utils/minimise.py | 59 +++++- pyproject.toml | 3 +- test_minimization_approach.ipynb | 198 +++++++++++++++++- 5 files changed, 280 insertions(+), 40 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 84eeb584..ac664753 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,4 +1,4 @@ - bump: minor changes: added: - - Enhanced CPS minimizing tests. \ No newline at end of file + - Enhanced CPS minimizing tests. 
\ No newline at end of file diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index ca53a84d..bf4b5501 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -40,7 +40,7 @@ def reweight( targets_array, dropout_rate=0.05, log_path="calibration_log.csv", - penalty_approach="l0_sigmoid", + penalty_approach=None, ): target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") @@ -76,35 +76,43 @@ def loss(weights, penalty_approach=penalty_approach): if torch.isnan(rel_error_normalized).any(): raise ValueError("Relative error contains NaNs") - # L0 penalty (approximated with smooth function) - # Since L0 is non-differentiable, we use a smooth approximation - # Common approaches: + if penalty_approach is not None: + # L0 penalty (approximated with smooth function) + # Since L0 is non-differentiable, we use a smooth approximation + # Common approaches: - epsilon = 1e-3 # Threshold for "near zero" - l0_penalty_weight = 1e-1 # Adjust this hyperparameter + epsilon = 1e-3 # Threshold for "near zero" + l0_penalty_weight = 1e-1 # Adjust this hyperparameter - # Option 1: Sigmoid approximation - if penalty_approach == "l0_sigmoid": - smoothed_l0 = torch.sigmoid( - (weights - epsilon) / (epsilon * 0.1) - ).mean() + # Option 1: Sigmoid approximation + if penalty_approach == "l0_sigmoid": + smoothed_l0 = torch.sigmoid( + (weights - epsilon) / (epsilon * 0.1) + ).mean() - # Option 2: Log-sum penalty (smoother) - if penalty_approach == "l0_log": - smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len(weights) + # Option 2: Log-sum penalty (smoother) + if penalty_approach == "l0_log": + smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len( + weights + ) - # Option 3: Exponential penalty - if penalty_approach == "l0_exp": - smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() + # Option 3: Exponential penalty + if penalty_approach == "l0_exp": + smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() - # L1 penalty - l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs + # L1 penalty + l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs - if penalty_approach == "l1": - l1 = torch.mean(weights) - return rel_error_normalized.mean() + l1_penalty_weight * l1 + if penalty_approach == "l1": + l1 = torch.mean(weights) + return rel_error_normalized.mean() + l1_penalty_weight * l1 - return rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 + return ( + rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 + ) + + else: + return rel_error_normalized.mean() def dropout_weights(weights, p): if p == 0: @@ -249,9 +257,9 @@ def generate(self): loss_matrix, targets_array = build_loss_matrix( self.input_dataset, year ) - zero_mask = np.isclose(targets_array, 0.0, atol=0.1) + bad_mask = loss_matrix.columns.isin(bad_targets) - keep_mask_bool = ~(zero_mask | bad_mask) + keep_mask_bool = ~bad_mask keep_idx = np.where(keep_mask_bool)[0] loss_matrix_clean = loss_matrix.iloc[:, keep_idx] targets_array_clean = targets_array[keep_idx] diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index da2cb7d1..9c3d59eb 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -5,14 +5,33 @@ import pandas as pd import h5py from policyengine_us_data.storage import STORAGE_FOLDER -from typing import Optional +from typing import Optional, Callable 
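# Minimal sketch of how the bad-target columns listed below are dropped from the
# loss matrix before calibration. The two-column DataFrame is a toy stand-in for
# the output of build_loss_matrix, and the second column name is invented.
import numpy as np
import pandas as pd

loss_matrix = pd.DataFrame(
    np.ones((3, 2)),
    columns=[
        "state/RI/adjusted_gross_income/amount/-inf_1",
        "nation/some_well_behaved_target",
    ],
)
targets = np.array([1.0, 2.0])
bad_targets = ["state/RI/adjusted_gross_income/amount/-inf_1"]

bad_mask = loss_matrix.columns.isin(bad_targets)
keep_idx = np.where(~bad_mask)[0]
loss_matrix_clean = loss_matrix.iloc[:, keep_idx]
targets_clean = targets[keep_idx]
assert loss_matrix_clean.shape[1] == targets_clean.size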
+bad_targets = [ + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", +] -def create_calibration_log_file(file_path): + +def create_calibration_log_file(file_path, epoch=0): dataset = Dataset.from_file(file_path) loss_matrix, targets = build_loss_matrix(dataset, 2024) + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_clean = targets[keep_idx] + assert loss_matrix_clean.shape[1] == targets_clean.size + loss_matrix, targets = build_loss_matrix(dataset, 2024) + bad_mask = loss_matrix.columns.isin(bad_targets) keep_mask_bool = ~bad_mask keep_idx = np.where(keep_mask_bool)[0] @@ -22,6 +41,10 @@ def create_calibration_log_file(file_path): sim = Microsimulation(dataset=dataset) + estimates = ( + sim.calculate("household_weight", 2024).values @ loss_matrix_clean + ) + target_names = loss_matrix_clean.columns estimates = ( sim.calculate("household_weight", 2024).values @ loss_matrix_clean ) @@ -32,9 +55,11 @@ def create_calibration_log_file(file_path): "target_name": target_names, "estimate": estimates, "target": targets_clean, + "target": targets_clean, } ) df["epoch"] = epoch + df["epoch"] = epoch df["error"] = df["estimate"] - df["target"] df["rel_error"] = df["error"] / df["target"] df["abs_error"] = df["error"].abs() @@ -43,6 +68,11 @@ def create_calibration_log_file(file_path): if df["target"].abs().sum() > 0 else np.nan ) + df["rel_abs_error"] = ( + df["abs_error"] / df["target"].abs() + if df["target"].abs().sum() > 0 + else np.nan + ) df["loss"] = (df["rel_error"] ** 2).mean() df.to_csv( @@ -258,6 +288,7 @@ def random_sampling_minimization( targets, normalisation_factor, random=True, + random=True, target_fractions=[0.1, 0.2, 0.3, 0.4, 0.5], ): """A simple random sampling approach""" @@ -265,6 +296,8 @@ def random_sampling_minimization( household_weights_normalized = weights / weights.sum() + household_weights_normalized = weights / weights.sum() + final_mask = None lowest_loss = float("inf") for fraction in target_fractions: @@ -283,6 +316,14 @@ def random_sampling_minimization( replace=False, ) ] = True + mask[ + np.random.choice( + n, + target_size, + p=household_weights_normalized if random else None, + replace=False, + ) + ] = True loss = get_loss_from_mask( weights, mask, estimate_matrix, targets, normalisation_factor @@ -322,6 +363,14 @@ def minimise_dataset( dataset = Dataset.from_file(dataset) loss_matrix, targets = build_loss_matrix(dataset, 2024) + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_clean = targets[keep_idx] + assert loss_matrix_clean.shape[1] == targets_clean.size + loss_matrix, targets = build_loss_matrix(dataset, 2024) + bad_mask = 
loss_matrix.columns.isin(bad_targets) keep_mask_bool = ~bad_mask keep_idx = np.where(keep_mask_bool)[0] @@ -333,6 +382,7 @@ def minimise_dataset( weights = sim.calculate("household_weight", 2024).values is_national = loss_matrix_clean.columns.str.startswith("nation/") + is_national = loss_matrix_clean.columns.str.startswith("nation/") nation_normalisation_factor = is_national * (1 / is_national.sum()) state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) normalisation_factor = np.where( @@ -344,8 +394,11 @@ def minimise_dataset( weights=weights, estimate_matrix=loss_matrix_clean, targets=targets_clean, + estimate_matrix=loss_matrix_clean, + targets=targets_clean, normalisation_factor=normalisation_factor, **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. + **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. ) # Extract household IDs for remaining households @@ -386,7 +439,7 @@ def minimise_dataset( f.create_dataset(f"{variable}/{year}", data=value) print(f"Saved minimised dataset to {output_path}") - create_calibration_log_file(output_path) + create_calibration_log_file(output_path, epoch=500) if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml index 5a75693f..7f3e59b5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,9 +17,8 @@ authors = [ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ - "policyengine-us>=1.340.1", + "policyengine-us>=1.340.0", "policyengine-core>=3.17.1", - "pandas>=2.3.0", "requests", "tqdm", "microdf_python>=0.4.3", diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 519d2725..5a7a9d15 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "d6dc9cca", "metadata": {}, "outputs": [], @@ -12,15 +12,188 @@ "from policyengine_us import Microsimulation\n", "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", "from policyengine_us_data.utils import build_loss_matrix\n", - "import numpy as np" + "import numpy as np\n", + "import os\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "db975ac1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + "Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid enrollment for IN with target 1623361k\n", + 
"Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with target 1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + "Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n", + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + "Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid enrollment for IN with target 1623361k\n", + "Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with target 1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + 
"Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n", + "Iteration 1: Loss changed from 4.565926440883813e-08 to 5.336074793134824e-08\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 2: Loss changed from 5.336074793134824e-08 to 6.613975524012925e-08\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 3: Loss changed from 6.613975524012925e-08 to 8.772356346466888e-08\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 4: Loss changed from 8.772356346466888e-08 to 1.1406333044735764e-07\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 5: Loss changed from 1.1406333044735764e-07 to 1.3666972105275468e-07\n", + "Removed 4131 households with worst relative loss changes.\n", + "Weight relative change: 52.19%\n", + "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + 
"Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid enrollment for IN with target 1623361k\n", + "Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with target 1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + "Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n" + ] + } + ], "source": [ "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", "\n", @@ -28,18 +201,17 @@ " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", " ]\n", "\n", - "approach = \"random_sampling_minimization\" # for which you can specify the fraction\n", - "minimization_function = random_sampling_minimization\n", - "# other minimization function approach is \"candidate_loss_contribution\"\n", + "approach = \"candidate_loss_contribution\" # for which you can specify the fraction\n", + "minimization_function = candidate_loss_contribution\n", + "# other minimization function approach is \"candidate_loss_contribution\", for which you can specify the tolerance for loss relative change.\n", "\n", "for file in files:\n", " output_path = STORAGE_FOLDER / approach / 
\"enhanced_cps_2024_minimised.h5\"\n", " minimise_dataset(\n", " file,\n", " output_path,\n", - " loss_rel_change_max=10,\n", " minimization_function=minimization_function, \n", - " target_fractions=[0.5] # remove if switching approach\n", + " #target_fractions=[0.5] # remove if switching approach\n", " )" ] }, @@ -81,6 +253,14 @@ "\n", "data.save_dataset(output_path)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4cf8e89", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From a71530b7b6f2723cfbf54a64f8f28f9d77e6da1d Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Mon, 14 Jul 2025 19:56:54 +0200 Subject: [PATCH 39/58] fix calibration for each approach --- .../datasets/cps/enhanced_cps.py | 1 + policyengine_us_data/utils/loss.py | 5 -- policyengine_us_data/utils/minimise.py | 89 ++++++++++++++----- test_minimization_approach.ipynb | 86 ++++++++++++------ 4 files changed, 129 insertions(+), 52 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index bf4b5501..33f62929 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -39,6 +39,7 @@ def reweight( loss_matrix, targets_array, dropout_rate=0.05, + epochs=500, log_path="calibration_log.csv", penalty_approach=None, ): diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index 21abce0f..fbdbacef 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -552,11 +552,6 @@ def build_loss_matrix(dataset: type, time_period): # Convert to thousands for the target targets_array.append(row["enrollment"]) - print( - f"Targeting Medicaid enrollment for {row['state']} " - f"with target {row['enrollment']:.0f}k" - ) - # State 10-year age targets age_targets = pd.read_csv(STORAGE_FOLDER / "age_state.csv") diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 9c3d59eb..84c55d31 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -6,6 +6,7 @@ import h5py from policyengine_us_data.storage import STORAGE_FOLDER from typing import Optional, Callable +from policyengine_us_data.datasets.cps.enhanced_cps import reweight bad_targets = [ "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", @@ -20,35 +21,54 @@ def create_calibration_log_file(file_path, epoch=0): + print(f"=== CALIBRATION LOG DEBUG ===") + print(f"File path: {file_path}") + print(f"Epoch: {epoch}") + dataset = Dataset.from_file(file_path) + sim = Microsimulation(dataset=dataset) - loss_matrix, targets = build_loss_matrix(dataset, 2024) + # Debug: Print dataset info + household_weights = sim.calculate("household_weight", 2024) + print(f"Number of households: {len(household_weights)}") + print(f"Total weight: {household_weights.sum():.2f}") + print( + f"Weight range: {household_weights.min():.2f} to {household_weights.max():.2f}" + ) - bad_mask = loss_matrix.columns.isin(bad_targets) - keep_mask_bool = ~bad_mask - keep_idx = np.where(keep_mask_bool)[0] - loss_matrix_clean = loss_matrix.iloc[:, keep_idx] - targets_clean = targets[keep_idx] - assert loss_matrix_clean.shape[1] == targets_clean.size loss_matrix, targets = build_loss_matrix(dataset, 2024) + print(f"Loss matrix shape: {loss_matrix.shape}") + print(f"Number of targets: {len(targets)}") bad_mask = loss_matrix.columns.isin(bad_targets) keep_mask_bool = ~bad_mask keep_idx = 
np.where(keep_mask_bool)[0] loss_matrix_clean = loss_matrix.iloc[:, keep_idx] targets_clean = targets[keep_idx] - assert loss_matrix_clean.shape[1] == targets_clean.size - sim = Microsimulation(dataset=dataset) + print(f"After filtering bad targets:") + print(f"Loss matrix clean shape: {loss_matrix_clean.shape}") + print(f"Number of clean targets: {len(targets_clean)}") + + assert loss_matrix_clean.shape[1] == targets_clean.size estimates = ( sim.calculate("household_weight", 2024).values @ loss_matrix_clean ) target_names = loss_matrix_clean.columns - estimates = ( - sim.calculate("household_weight", 2024).values @ loss_matrix_clean - ) - target_names = loss_matrix_clean.columns + + # Debug: Print estimate statistics + print(f"Estimates shape: {estimates.shape}") + print(f"Estimates sum: {estimates.sum():.2f}") + print(f"First 3 estimates: {estimates[:3]}") + print(f"First 3 targets: {targets_clean[:3]}") + + # Calculate and print some key metrics + errors = estimates - targets_clean + rel_errors = errors / targets_clean + print(f"Mean absolute error: {np.abs(errors).mean():.2f}") + print(f"Mean relative error: {np.abs(rel_errors).mean():.4f}") + print(f"=== END DEBUG ===\n") df = pd.DataFrame( { @@ -158,6 +178,7 @@ def get_loss_from_mask( """ Calculate the loss based on the inclusion mask and the estimate matrix. """ + # Step 1: Apply mask and rescale weights masked_weights = weights.copy() original_weight_total = masked_weights.sum() if (~inclusion_mask).sum() > 0: @@ -166,7 +187,26 @@ def get_loss_from_mask( masked_weights[inclusion_mask] *= ( original_weight_total / masked_weight_total ) - estimates = masked_weights @ estimate_matrix + + # Step 2: Re-calibrate the masked weights to hit targets + # Only calibrate the included households + included_weights = masked_weights[inclusion_mask] + included_estimate_matrix = estimate_matrix[inclusion_mask] + + # Call reweight function to calibrate the selected households + calibrated_weights_included = reweight( + included_weights, + included_estimate_matrix, + targets, + epochs=250, + ) + + # Put calibrated weights back into full array + calibrated_weights = np.zeros_like(masked_weights) + calibrated_weights[inclusion_mask] = calibrated_weights_included + + # Calculate estimates and loss from calibrated weights + estimates = calibrated_weights @ estimate_matrix rel_error = ((estimates - targets) + 1) / (targets + 1) loss = ((rel_error * normalisation_factor) ** 2).mean() @@ -288,8 +328,7 @@ def random_sampling_minimization( targets, normalisation_factor, random=True, - random=True, - target_fractions=[0.1, 0.2, 0.3, 0.4, 0.5], + target_fractions=[0.5, 0.6, 0.7, 0.8, 0.9], ): """A simple random sampling approach""" n = len(weights) @@ -306,7 +345,7 @@ def random_sampling_minimization( best_mask = None best_loss = float("inf") - for _ in range(5): # Try 5 random samples + for _ in range(3): # Try 3 random samples mask = np.zeros(n, dtype=bool) mask[ np.random.choice( @@ -419,12 +458,20 @@ def minimise_dataset( sim = Microsimulation(dataset=smaller_df) # Rescale weights to maintain total - sim.set_input( - "household_weight", - 2024, - sim.calculate("household_weight", 2024).values / weight_rel_change, + initial_weights = ( + sim.calculate("household_weight", 2024).values / weight_rel_change ) + # Re-calibrate the final selected households to hit targets + print("Re-calibrating final selected households...") + calibrated_weights = reweight( + initial_weights, + loss_matrix_clean.values, # Convert to numpy array + targets_clean, + epochs=250, # 
Reduced epochs for faster processing + ) + sim.set_input("household_weight", 2024, calibrated_weights) + print("Final calibration completed successfully") # Prepare data for saving data = {} for variable in sim.input_variables: diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 5a7a9d15..6683da0c 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 6, + "execution_count": 1, "id": "d6dc9cca", "metadata": {}, "outputs": [], @@ -13,12 +13,27 @@ "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", "from policyengine_us_data.utils import build_loss_matrix\n", "import numpy as np\n", - "import os\n" + "import os" ] }, { "cell_type": "code", +<<<<<<< HEAD "execution_count": 7, +======= + "execution_count": null, + "id": "6daabe7c", + "metadata": {}, + "outputs": [], + "source": [ + "# Original ECPS 2024 dataset size (for household entity): 41310\n", + "# After minimization through \"candidate_loss_contribution\" and a 1.0 max error change (for household entity): 20655 \n", + "# After minimization through \"candidate_loss_contribution\" and a 0.001 max error change (for household entity): 24786\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "id": "db975ac1", "metadata": {}, "outputs": [ @@ -128,18 +143,17 @@ "Targeting Medicaid enrollment for WI with target 1108320k\n", "Targeting Medicaid enrollment for WV with target 467632k\n", "Targeting Medicaid enrollment for WY with target 57320k\n", - "Iteration 1: Loss changed from 4.565926440883813e-08 to 5.336074793134824e-08\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 2: Loss changed from 5.336074793134824e-08 to 6.613975524012925e-08\n", + "Iteration 1: Loss changed from 3.2762747622384236e-07 to 3.269240481699973e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Iteration 3: Loss changed from 6.613975524012925e-08 to 8.772356346466888e-08\n", + "Iteration 2: Loss changed from 3.269240481699973e-07 to 3.260236288186747e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Iteration 4: Loss changed from 8.772356346466888e-08 to 1.1406333044735764e-07\n", + "Iteration 3: Loss changed from 3.260236288186747e-07 to 3.258031076364204e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Iteration 5: Loss changed from 1.1406333044735764e-07 to 1.3666972105275468e-07\n", + "Iteration 4: Loss changed from 3.258031076364204e-07 to 3.2572688458943833e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Weight relative change: 52.19%\n", - "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", + "Iteration 5: Loss changed from 3.2572688458943833e-07 to 3.2598496400284314e-07, which is too high (0.08%). 
Stopping.\n", + "Weight relative change: 99.10%\n", + "Saved minimised dataset to /Users/movil1/Desktop/PYTHONJOBS/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", "Targeting Medicaid enrollment for AK with target 231577k\n", "Targeting Medicaid enrollment for AL with target 766009k\n", "Targeting Medicaid enrollment for AR with target 733561k\n", @@ -203,32 +217,38 @@ "\n", "approach = \"candidate_loss_contribution\" # for which you can specify the fraction\n", "minimization_function = candidate_loss_contribution\n", - "# other minimization function approach is \"candidate_loss_contribution\", for which you can specify the tolerance for loss relative change.\n", + "# other minimization function approach is \"random_sampling_minimization\", for which you can specify the tolerance for loss relative change.\n", "\n", "for file in files:\n", " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + "\n", " minimise_dataset(\n", " file,\n", " output_path,\n", " minimization_function=minimization_function, \n", - " #target_fractions=[0.5] # remove if switching approach\n", + " # target_fractions=[0.5] # remove if switching approach\n", + " loss_rel_change_max=0.0001, # remove if switching approach\n", " )" ] }, { "cell_type": "code", - "execution_count": null, - "id": "35892c9d", + "execution_count": 4, + "id": "b4cf8e89", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250/250 [01:24<00:00, 2.98it/s, loss=3.37e-5, loss_rel_change=-0.92] \n" + ] + } + ], "source": [ - "## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation\n", - "\n", "input_dataset = ExtendedCPS_2024\n", "\n", - "approach = \"l0_sigmoid\"\n", - "# other options are \"l0_log\", \"l0_exp\", \"l1\"\n", - "\n", "sim = Microsimulation(dataset=input_dataset)\n", "data = sim.dataset.load_dataset()\n", "data[\"household_weight\"] = {}\n", @@ -240,18 +260,32 @@ " loss_matrix, targets_array = build_loss_matrix(\n", " input_dataset, year\n", " )\n", + "\n", + " bad_mask = loss_matrix.columns.isin(bad_targets)\n", + " keep_mask_bool = ~bad_mask\n", + " keep_idx = np.where(keep_mask_bool)[0]\n", + " loss_matrix_clean = loss_matrix.iloc[:, keep_idx]\n", + " targets_array_clean = targets_array[keep_idx]\n", + " assert loss_matrix_clean.shape[1] == targets_array_clean.size\n", + " assert loss_matrix_clean.shape[1] != loss_matrix.shape[1]\n", + "\n", " optimised_weights = reweight(\n", " original_weights,\n", - " loss_matrix,\n", - " targets_array,\n", - " log_path= STORAGE_FOLDER / approach / \"calibration_log.csv\",\n", - " penalty_approach=approach,\n", + " loss_matrix_clean,\n", + " targets_array_clean,\n", + " log_path=\"baseline_calibration_log.csv\",\n", + " epochs=250, # Reduced epochs for faster processing\n", " )\n", " data[\"household_weight\"][year] = optimised_weights\n", "\n", - "output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + "output_path = STORAGE_FOLDER / \"baseline\" / \"enhanced_cps_2024_baseline.h5\"\n", + "output_path.parent.mkdir(parents=True, exist_ok=True)\n", "\n", - "data.save_dataset(output_path)" + "# Save to HDF5 file\n", + "with h5py.File(output_path, \"w\") as f:\n", + " for variable, values in data.items():\n", + " for year, value in values.items():\n", + " 
f.create_dataset(f\"{variable}/{year}\", data=value)" ] }, { From f146620a9c71761336d7b1c49ae5e54b09f100e4 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Mon, 14 Jul 2025 20:19:38 +0200 Subject: [PATCH 40/58] fixed testing framework --- policyengine_us_data/utils/minimise.py | 39 +-- test_minimization_approach.ipynb | 330 ++++++++++--------------- 2 files changed, 134 insertions(+), 235 deletions(-) diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 84c55d31..b3e0ed1a 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -75,11 +75,9 @@ def create_calibration_log_file(file_path, epoch=0): "target_name": target_names, "estimate": estimates, "target": targets_clean, - "target": targets_clean, } ) df["epoch"] = epoch - df["epoch"] = epoch df["error"] = df["estimate"] - df["target"] df["rel_error"] = df["error"] / df["target"] df["abs_error"] = df["error"].abs() @@ -88,11 +86,6 @@ def create_calibration_log_file(file_path, epoch=0): if df["target"].abs().sum() > 0 else np.nan ) - df["rel_abs_error"] = ( - df["abs_error"] / df["target"].abs() - if df["target"].abs().sum() > 0 - else np.nan - ) df["loss"] = (df["rel_error"] ** 2).mean() df.to_csv( @@ -172,6 +165,7 @@ def minimise_dataset( ) weights @ estimate_matrix + def get_loss_from_mask( weights, inclusion_mask, estimate_matrix, targets, normalisation_factor ): @@ -264,15 +258,6 @@ def candidate_loss_contribution( size=int(full_mask.sum() * view_fraction_per_iteration), replace=False, ) - - # more efficient approach to compute losses for candidate households to be removed - - # 1. sample only households that are currently *included* - indices = np.random.choice( - np.where(full_mask)[0], - size=int(full_mask.sum() * VIEW_FRACTION_PER_ITERATION), - replace=False, - ) # 2. 
compute losses for the batch in one shot candidate_losses = losses_for_candidates( weights, indices, estimate_matrix, targets, normalisation_factor @@ -335,8 +320,6 @@ def random_sampling_minimization( household_weights_normalized = weights / weights.sum() - household_weights_normalized = weights / weights.sum() - final_mask = None lowest_loss = float("inf") for fraction in target_fractions: @@ -355,14 +338,6 @@ def random_sampling_minimization( replace=False, ) ] = True - mask[ - np.random.choice( - n, - target_size, - p=household_weights_normalized if random else None, - replace=False, - ) - ] = True loss = get_loss_from_mask( weights, mask, estimate_matrix, targets, normalisation_factor @@ -402,14 +377,6 @@ def minimise_dataset( dataset = Dataset.from_file(dataset) loss_matrix, targets = build_loss_matrix(dataset, 2024) - bad_mask = loss_matrix.columns.isin(bad_targets) - keep_mask_bool = ~bad_mask - keep_idx = np.where(keep_mask_bool)[0] - loss_matrix_clean = loss_matrix.iloc[:, keep_idx] - targets_clean = targets[keep_idx] - assert loss_matrix_clean.shape[1] == targets_clean.size - loss_matrix, targets = build_loss_matrix(dataset, 2024) - bad_mask = loss_matrix.columns.isin(bad_targets) keep_mask_bool = ~bad_mask keep_idx = np.where(keep_mask_bool)[0] @@ -421,7 +388,6 @@ def minimise_dataset( weights = sim.calculate("household_weight", 2024).values is_national = loss_matrix_clean.columns.str.startswith("nation/") - is_national = loss_matrix_clean.columns.str.startswith("nation/") nation_normalisation_factor = is_national * (1 / is_national.sum()) state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) normalisation_factor = np.where( @@ -433,11 +399,8 @@ def minimise_dataset( weights=weights, estimate_matrix=loss_matrix_clean, targets=targets_clean, - estimate_matrix=loss_matrix_clean, - targets=targets_clean, normalisation_factor=normalisation_factor, **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. - **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. 
) # Extract household IDs for remaining households diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 6683da0c..7c416e2a 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "d6dc9cca", "metadata": {}, "outputs": [], @@ -13,228 +13,172 @@ "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", "from policyengine_us_data.utils import build_loss_matrix\n", "import numpy as np\n", - "import os" + "import os\n", + "import h5py\n", + "\n", + "bad_targets = [\n", + " \"nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household\",\n", + " \"nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household\",\n", + " \"nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse\",\n", + " \"nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse\",\n", + " \"nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household\",\n", + " \"nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household\",\n", + " \"nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse\",\n", + " \"nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse\",\n", + "]" ] }, { "cell_type": "code", -<<<<<<< HEAD - "execution_count": 7, -======= "execution_count": null, - "id": "6daabe7c", + "id": "683fd57e", "metadata": {}, "outputs": [], "source": [ - "# Original ECPS 2024 dataset size (for household entity): 41310\n", - "# After minimization through \"candidate_loss_contribution\" and a 1.0 max error change (for household entity): 20655 \n", - "# After minimization through \"candidate_loss_contribution\" and a 0.001 max error change (for household entity): 24786\n" + "# Length of household entity in the dataset measured through household_weight:\n", + "\n", + "# Original ECPS 2024 dataset size: 41310\n", + "# Through \"random_sampling_minimization\" with 0.5 of the dataset being pruned: 20655\n", + "# Through \"random_sampling_minimization\" with 0.2 of the dataset being pruned: 33408\n", + "# After minimization through \"candidate_loss_contribution\" and a 1.0 max error change: 20655 \n", + "# After minimization through \"candidate_loss_contribution\" and a 0.001 max error change: 24786" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "db975ac1", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Targeting Medicaid enrollment for AK with target 231577k\n", - "Targeting Medicaid enrollment for AL with target 766009k\n", - "Targeting Medicaid enrollment for AR with target 733561k\n", - "Targeting Medicaid enrollment for AZ with target 1778734k\n", - "Targeting Medicaid enrollment for CA with target 12172695k\n", - "Targeting Medicaid enrollment for CO with target 1058326k\n", - "Targeting Medicaid enrollment for CT with target 904321k\n", - "Targeting Medicaid enrollment for DC with target 240020k\n", - "Targeting Medicaid enrollment for DE with target 236840k\n", - "Targeting Medicaid enrollment for FL with target 3568648k\n", - "Targeting Medicaid enrollment for GA with target 1699279k\n", - "Targeting Medicaid enrollment for HI with target 376318k\n", - "Targeting Medicaid enrollment for IA with target 586748k\n", - "Targeting Medicaid enrollment for ID 
with target 296968k\n", - "Targeting Medicaid enrollment for IL with target 2918179k\n", - "Targeting Medicaid enrollment for IN with target 1623361k\n", - "Targeting Medicaid enrollment for KS with target 335902k\n", - "Targeting Medicaid enrollment for KY with target 1244822k\n", - "Targeting Medicaid enrollment for LA with target 1377806k\n", - "Targeting Medicaid enrollment for MA with target 1453344k\n", - "Targeting Medicaid enrollment for MD with target 1280697k\n", - "Targeting Medicaid enrollment for ME with target 322306k\n", - "Targeting Medicaid enrollment for MI with target 2194067k\n", - "Targeting Medicaid enrollment for MN with target 1146667k\n", - "Targeting Medicaid enrollment for MO with target 1118780k\n", - "Targeting Medicaid enrollment for MS with target 514730k\n", - "Targeting Medicaid enrollment for MT with target 193278k\n", - "Targeting Medicaid enrollment for NC with target 2469712k\n", - "Targeting Medicaid enrollment for ND with target 100543k\n", - "Targeting Medicaid enrollment for NE with target 302971k\n", - "Targeting Medicaid enrollment for NH with target 166813k\n", - "Targeting Medicaid enrollment for NJ with target 1506239k\n", - "Targeting Medicaid enrollment for NM with target 686825k\n", - "Targeting Medicaid enrollment for NV with target 713936k\n", - "Targeting Medicaid enrollment for NY with target 5946806k\n", - "Targeting Medicaid enrollment for OH with target 2596879k\n", - "Targeting Medicaid enrollment for OK with target 894911k\n", - "Targeting Medicaid enrollment for OR with target 1123313k\n", - "Targeting Medicaid enrollment for PA with target 2783389k\n", - "Targeting Medicaid enrollment for RI with target 273400k\n", - "Targeting Medicaid enrollment for SC with target 932515k\n", - "Targeting Medicaid enrollment for SD with target 126952k\n", - "Targeting Medicaid enrollment for TN with target 1268904k\n", - "Targeting Medicaid enrollment for TX with target 3821806k\n", - "Targeting Medicaid enrollment for UT with target 300742k\n", - "Targeting Medicaid enrollment for VA with target 1596777k\n", - "Targeting Medicaid enrollment for VT with target 151833k\n", - "Targeting Medicaid enrollment for WA with target 1776116k\n", - "Targeting Medicaid enrollment for WI with target 1108320k\n", - "Targeting Medicaid enrollment for WV with target 467632k\n", - "Targeting Medicaid enrollment for WY with target 57320k\n", - "Targeting Medicaid enrollment for AK with target 231577k\n", - "Targeting Medicaid enrollment for AL with target 766009k\n", - "Targeting Medicaid enrollment for AR with target 733561k\n", - "Targeting Medicaid enrollment for AZ with target 1778734k\n", - "Targeting Medicaid enrollment for CA with target 12172695k\n", - "Targeting Medicaid enrollment for CO with target 1058326k\n", - "Targeting Medicaid enrollment for CT with target 904321k\n", - "Targeting Medicaid enrollment for DC with target 240020k\n", - "Targeting Medicaid enrollment for DE with target 236840k\n", - "Targeting Medicaid enrollment for FL with target 3568648k\n", - "Targeting Medicaid enrollment for GA with target 1699279k\n", - "Targeting Medicaid enrollment for HI with target 376318k\n", - "Targeting Medicaid enrollment for IA with target 586748k\n", - "Targeting Medicaid enrollment for ID with target 296968k\n", - "Targeting Medicaid enrollment for IL with target 2918179k\n", - "Targeting Medicaid enrollment for IN with target 1623361k\n", - "Targeting Medicaid enrollment for KS with target 335902k\n", - "Targeting Medicaid enrollment for KY with 
target 1244822k\n", - "Targeting Medicaid enrollment for LA with target 1377806k\n", - "Targeting Medicaid enrollment for MA with target 1453344k\n", - "Targeting Medicaid enrollment for MD with target 1280697k\n", - "Targeting Medicaid enrollment for ME with target 322306k\n", - "Targeting Medicaid enrollment for MI with target 2194067k\n", - "Targeting Medicaid enrollment for MN with target 1146667k\n", - "Targeting Medicaid enrollment for MO with target 1118780k\n", - "Targeting Medicaid enrollment for MS with target 514730k\n", - "Targeting Medicaid enrollment for MT with target 193278k\n", - "Targeting Medicaid enrollment for NC with target 2469712k\n", - "Targeting Medicaid enrollment for ND with target 100543k\n", - "Targeting Medicaid enrollment for NE with target 302971k\n", - "Targeting Medicaid enrollment for NH with target 166813k\n", - "Targeting Medicaid enrollment for NJ with target 1506239k\n", - "Targeting Medicaid enrollment for NM with target 686825k\n", - "Targeting Medicaid enrollment for NV with target 713936k\n", - "Targeting Medicaid enrollment for NY with target 5946806k\n", - "Targeting Medicaid enrollment for OH with target 2596879k\n", - "Targeting Medicaid enrollment for OK with target 894911k\n", - "Targeting Medicaid enrollment for OR with target 1123313k\n", - "Targeting Medicaid enrollment for PA with target 2783389k\n", - "Targeting Medicaid enrollment for RI with target 273400k\n", - "Targeting Medicaid enrollment for SC with target 932515k\n", - "Targeting Medicaid enrollment for SD with target 126952k\n", - "Targeting Medicaid enrollment for TN with target 1268904k\n", - "Targeting Medicaid enrollment for TX with target 3821806k\n", - "Targeting Medicaid enrollment for UT with target 300742k\n", - "Targeting Medicaid enrollment for VA with target 1596777k\n", - "Targeting Medicaid enrollment for VT with target 151833k\n", - "Targeting Medicaid enrollment for WA with target 1776116k\n", - "Targeting Medicaid enrollment for WI with target 1108320k\n", - "Targeting Medicaid enrollment for WV with target 467632k\n", - "Targeting Medicaid enrollment for WY with target 57320k\n", - "Iteration 1: Loss changed from 3.2762747622384236e-07 to 3.269240481699973e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 2: Loss changed from 3.269240481699973e-07 to 3.260236288186747e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 3: Loss changed from 3.260236288186747e-07 to 3.258031076364204e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 4: Loss changed from 3.258031076364204e-07 to 3.2572688458943833e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 5: Loss changed from 3.2572688458943833e-07 to 3.2598496400284314e-07, which is too high (0.08%). 
Stopping.\n", - "Weight relative change: 99.10%\n", - "Saved minimised dataset to /Users/movil1/Desktop/PYTHONJOBS/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", - "Targeting Medicaid enrollment for AK with target 231577k\n", - "Targeting Medicaid enrollment for AL with target 766009k\n", - "Targeting Medicaid enrollment for AR with target 733561k\n", - "Targeting Medicaid enrollment for AZ with target 1778734k\n", - "Targeting Medicaid enrollment for CA with target 12172695k\n", - "Targeting Medicaid enrollment for CO with target 1058326k\n", - "Targeting Medicaid enrollment for CT with target 904321k\n", - "Targeting Medicaid enrollment for DC with target 240020k\n", - "Targeting Medicaid enrollment for DE with target 236840k\n", - "Targeting Medicaid enrollment for FL with target 3568648k\n", - "Targeting Medicaid enrollment for GA with target 1699279k\n", - "Targeting Medicaid enrollment for HI with target 376318k\n", - "Targeting Medicaid enrollment for IA with target 586748k\n", - "Targeting Medicaid enrollment for ID with target 296968k\n", - "Targeting Medicaid enrollment for IL with target 2918179k\n", - "Targeting Medicaid enrollment for IN with target 1623361k\n", - "Targeting Medicaid enrollment for KS with target 335902k\n", - "Targeting Medicaid enrollment for KY with target 1244822k\n", - "Targeting Medicaid enrollment for LA with target 1377806k\n", - "Targeting Medicaid enrollment for MA with target 1453344k\n", - "Targeting Medicaid enrollment for MD with target 1280697k\n", - "Targeting Medicaid enrollment for ME with target 322306k\n", - "Targeting Medicaid enrollment for MI with target 2194067k\n", - "Targeting Medicaid enrollment for MN with target 1146667k\n", - "Targeting Medicaid enrollment for MO with target 1118780k\n", - "Targeting Medicaid enrollment for MS with target 514730k\n", - "Targeting Medicaid enrollment for MT with target 193278k\n", - "Targeting Medicaid enrollment for NC with target 2469712k\n", - "Targeting Medicaid enrollment for ND with target 100543k\n", - "Targeting Medicaid enrollment for NE with target 302971k\n", - "Targeting Medicaid enrollment for NH with target 166813k\n", - "Targeting Medicaid enrollment for NJ with target 1506239k\n", - "Targeting Medicaid enrollment for NM with target 686825k\n", - "Targeting Medicaid enrollment for NV with target 713936k\n", - "Targeting Medicaid enrollment for NY with target 5946806k\n", - "Targeting Medicaid enrollment for OH with target 2596879k\n", - "Targeting Medicaid enrollment for OK with target 894911k\n", - "Targeting Medicaid enrollment for OR with target 1123313k\n", - "Targeting Medicaid enrollment for PA with target 2783389k\n", - "Targeting Medicaid enrollment for RI with target 273400k\n", - "Targeting Medicaid enrollment for SC with target 932515k\n", - "Targeting Medicaid enrollment for SD with target 126952k\n", - "Targeting Medicaid enrollment for TN with target 1268904k\n", - "Targeting Medicaid enrollment for TX with target 3821806k\n", - "Targeting Medicaid enrollment for UT with target 300742k\n", - "Targeting Medicaid enrollment for VA with target 1596777k\n", - "Targeting Medicaid enrollment for VT with target 151833k\n", - "Targeting Medicaid enrollment for WA with target 1776116k\n", - "Targeting Medicaid enrollment for WI with target 1108320k\n", - "Targeting Medicaid enrollment for WV with target 467632k\n", - "Targeting Medicaid enrollment for WY with target 57320k\n" - ] - } - ], + "outputs": 
[], "source": [ + "## ALL TESTS\n", + "\n", + "## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation\n", + "\n", + "input_dataset = ExtendedCPS_2024\n", + "\n", + "approaches = [\"l0_sigmoid\", \"l0_log\", \"l0_exp\", \"l1\"]\n", + "\n", + "for approach in approaches:\n", + " sim = Microsimulation(dataset=input_dataset)\n", + " data = sim.dataset.load_dataset()\n", + " data[\"household_weight\"] = {}\n", + " original_weights = sim.calculate(\"household_weight\")\n", + " original_weights = original_weights.values + np.random.normal(\n", + " 1, 0.1, len(original_weights)\n", + " )\n", + " for year in range(2024, 2025):\n", + " loss_matrix, targets_array = build_loss_matrix(\n", + " input_dataset, year\n", + " )\n", + "\n", + " bad_mask = loss_matrix.columns.isin(bad_targets)\n", + " keep_mask_bool = ~bad_mask\n", + " keep_idx = np.where(keep_mask_bool)[0]\n", + " loss_matrix_clean = loss_matrix.iloc[:, keep_idx]\n", + " targets_array_clean = targets_array[keep_idx]\n", + " assert loss_matrix_clean.shape[1] == targets_array_clean.size\n", + "\n", + " optimised_weights = reweight(\n", + " original_weights,\n", + " loss_matrix_clean,\n", + " targets_array_clean,\n", + " log_path=\"calibration_log.csv\",\n", + " penalty_approach=approach,\n", + " epochs=250, # Reduced epochs for faster processing\n", + " )\n", + " data[\"household_weight\"][year] = optimised_weights\n", + "\n", + " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + "\n", + " # Save to HDF5 file\n", + " with h5py.File(output_path, \"w\") as f:\n", + " for variable, values in data.items():\n", + " for year, value in values.items():\n", + " f.create_dataset(f\"{variable}/{year}\", data=value)\n", + "\n", + "\n", "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", "\n", "files = [\n", " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", " ]\n", "\n", - "approach = \"candidate_loss_contribution\" # for which you can specify the fraction\n", - "minimization_function = candidate_loss_contribution\n", + "approaches = {\n", + " \"random_sampling_minimization\": random_sampling_minimization,\n", + " \"candidate_loss_contribution\": candidate_loss_contribution,\n", + "}\n", + "\n", + "optional_params = {\n", + " \"random_sampling_minimization\": {\n", + " \"target_fractions\": [0.5, 0.6, 0.7, 0.8, 0.9], # fractions of the dataset to keep\n", + " },\n", + " \"candidate_loss_contribution\": {\n", + " \"loss_rel_change_max\": [0.00001, 0.000001, 0.0000001] # maximum relative change in\n", + " }\n", + "}\n", + "\n", + "for approach, function in approaches.items():\n", + " minimization_function = function\n", + " # other minimization function approach is \"random_sampling_minimization\", for which you can specify the tolerance for loss relative change.\n", + "\n", + " for params, values in optional_params[approach].items():\n", + " for value in values:\n", + " if params == \"target_fractions\":\n", + " for file in files:\n", + " output_path = STORAGE_FOLDER / approach / f\"{value}_enhanced_cps_2024_minimised.h5\"\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + " minimise_dataset(\n", + " file,\n", + " output_path,\n", + " minimization_function=minimization_function, \n", + " target_fractions=[value]\n", + " )\n", + " elif params == \"loss_rel_change_max\":\n", + " for file in files:\n", + " output_path = STORAGE_FOLDER / approach / 
f\"{value}_enhanced_cps_2024_minimised.h5\"\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + " minimise_dataset(\n", + " file,\n", + " output_path,\n", + " minimization_function=minimization_function, \n", + " loss_rel_change_max=value\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35892c9d", + "metadata": {}, + "outputs": [], + "source": [ + "## SMALL CHECKS BELOW -- IGNORE ---\n", + "\n", + "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", + "\n", + "files = [\n", + " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", + " ]\n", + "\n", + "minimization_function = random_sampling_minimization\n", "# other minimization function approach is \"random_sampling_minimization\", for which you can specify the tolerance for loss relative change.\n", "\n", "for file in files:\n", - " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + " output_path = STORAGE_FOLDER / \"random_sampling_minimization\" / f\"{1.0}_enhanced_cps_2024_minimised.h5\"\n", " output_path.parent.mkdir(parents=True, exist_ok=True)\n", - "\n", " minimise_dataset(\n", " file,\n", " output_path,\n", " minimization_function=minimization_function, \n", - " # target_fractions=[0.5] # remove if switching approach\n", - " loss_rel_change_max=0.0001, # remove if switching approach\n", + " target_fractions=[1.0]\n", " )" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "b4cf8e89", "metadata": {}, "outputs": [ @@ -287,14 +231,6 @@ " for year, value in values.items():\n", " f.create_dataset(f\"{variable}/{year}\", data=value)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b4cf8e89", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From 68349f80e0b2d6af2893eb738dde6c0a8b7eb9bd Mon Sep 17 00:00:00 2001 From: eccuraa Date: Mon, 14 Jul 2025 14:23:01 -0400 Subject: [PATCH 41/58] starting to collect results --- test_minimization_approach.ipynb | 152 ++++++++++++++++++++++++++++++- 1 file changed, 148 insertions(+), 4 deletions(-) diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 54f3c6fa..ffed7e46 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -5,7 +5,16 @@ "execution_count": 1, "id": "d6dc9cca", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], "source": [ "from policyengine_us_data.utils.minimise import minimise_dataset, random_sampling_minimization, candidate_loss_contribution\n", "from policyengine_us_data.storage import STORAGE_FOLDER\n", @@ -268,18 +277,153 @@ "data.save_dataset(output_path)" ] }, + { + "cell_type": "markdown", + "id": "fedc4fc7", + "metadata": {}, + "source": [ + "## FULL DATA DOWNLOAD PIPELINE" + ] + }, + { + "cell_type": "markdown", + "id": "2218e211", + "metadata": {}, + "source": [ + "Set up line plot dataframe, initializing it with the original enhanced_cps results." 
+ ] + }, { "cell_type": "code", "execution_count": null, - "id": "b4cf8e89", + "id": "3b1bba26", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "# Create initial DataFrame and adding structure\n", + "results_df = pd.DataFrame({\n", + " 'strategy': ['none'],\n", + " 'parameter': ['none'],\n", + " 'dataset_size': [41600],\n", + " 'total_loss': [6.9e-3]\n", + "})\n", + "\n", + "def add_result(df, strategy, parameter, dataset_size, total_loss):\n", + " new_rows = pd.DataFrame({\n", + " 'strategy': strategy,\n", + " 'parameter': parameter,\n", + " 'dataset_size': dataset_size,\n", + " 'total_loss': total_loss\n", + " })\n", + " return pd.concat([df, new_rows], ignore_index=True)\n", + "\n", + "# Example usage:\n", + "#df = add_result(df, ['L1', 'L2'], ['0.001','0.002'] , [35000, 4000], [7.2e-3, 7.2e-3])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6df48427", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
strategyparameterdataset_sizetotal_loss
0nonenone416000.0069
1L10.001350000.0072
2L20.00240000.0072
\n", + "
" + ], + "text/plain": [ + " strategy parameter dataset_size total_loss\n", + "0 none none 41600 0.0069\n", + "1 L1 0.001 35000 0.0072\n", + "2 L2 0.002 4000 0.0072" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "markdown", + "id": "aa483f59", + "metadata": {}, + "source": [ + "Collecting length of dataset and total loss values for every regularization strategy." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9eb9602c", "metadata": {}, "outputs": [], "source": [] + }, + { + "cell_type": "markdown", + "id": "f5023a3a", + "metadata": {}, + "source": [ + "### VISUALIZATION" + ] } ], "metadata": { "kernelspec": { - "display_name": "pe", + "display_name": "policyengine-us-data", "language": "python", "name": "python3" }, @@ -293,7 +437,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.11" + "version": "3.11.13" } }, "nbformat": 4, From 4d593b99e049004ae354ca8a6349056c5b57108c Mon Sep 17 00:00:00 2001 From: eccuraa Date: Mon, 14 Jul 2025 17:07:22 -0400 Subject: [PATCH 42/58] added functionality for running multiple L0/L1 penalty values & dataframe for plotting --- .../datasets/cps/enhanced_cps.py | 14 +- test_minimization_approach.ipynb | 382 ++++++++++++++++-- 2 files changed, 349 insertions(+), 47 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 83fe6b99..851ea464 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -42,6 +42,8 @@ def reweight( epochs=500, log_path="calibration_log.csv", penalty_approach=None, + penalty_weight=None, + ): target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") @@ -60,7 +62,7 @@ def reweight( ) # TO DO: replace this with a call to the python reweight.py package. 
- def loss(weights, penalty_approach=penalty_approach): + def loss(weights, penalty_approach=penalty_approach, penalty_weight=penalty_weight): # Check for Nans in either the weights or the loss matrix if torch.isnan(weights).any(): raise ValueError("Weights contain NaNs") @@ -77,13 +79,13 @@ def loss(weights, penalty_approach=penalty_approach): if torch.isnan(rel_error_normalized).any(): raise ValueError("Relative error contains NaNs") - if penalty_approach is not None: + if penalty_approach is not None and penalty_weight is not None: # L0 penalty (approximated with smooth function) # Since L0 is non-differentiable, we use a smooth approximation # Common approaches: epsilon = 1e-3 # Threshold for "near zero" - l0_penalty_weight = 1e-1 # Adjust this hyperparameter + # Option 1: Sigmoid approximation if penalty_approach == "l0_sigmoid": @@ -101,15 +103,13 @@ def loss(weights, penalty_approach=penalty_approach): if penalty_approach == "l0_exp": smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() - # L1 penalty - l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs if penalty_approach == "l1": l1 = torch.mean(weights) - return rel_error_normalized.mean() + l1_penalty_weight * l1 + return rel_error_normalized.mean() + penalty_weight * l1 return ( - rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 + rel_error_normalized.mean() + penalty_weight * smoothed_l0 ) else: diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 7c416e2a..bb77568a 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -15,6 +15,8 @@ "import numpy as np\n", "import os\n", "import h5py\n", + "import pandas as pd\n", + "\n", "\n", "bad_targets = [\n", " \"nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household\",\n", @@ -58,45 +60,58 @@ "input_dataset = ExtendedCPS_2024\n", "\n", "approaches = [\"l0_sigmoid\", \"l0_log\", \"l0_exp\", \"l1\"]\n", + "penalty_weights = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]\n", "\n", - "for approach in approaches:\n", - " sim = Microsimulation(dataset=input_dataset)\n", - " data = sim.dataset.load_dataset()\n", - " data[\"household_weight\"] = {}\n", - " original_weights = sim.calculate(\"household_weight\")\n", - " original_weights = original_weights.values + np.random.normal(\n", - " 1, 0.1, len(original_weights)\n", - " )\n", - " for year in range(2024, 2025):\n", - " loss_matrix, targets_array = build_loss_matrix(\n", - " input_dataset, year\n", - " )\n", - "\n", - " bad_mask = loss_matrix.columns.isin(bad_targets)\n", - " keep_mask_bool = ~bad_mask\n", - " keep_idx = np.where(keep_mask_bool)[0]\n", - " loss_matrix_clean = loss_matrix.iloc[:, keep_idx]\n", - " targets_array_clean = targets_array[keep_idx]\n", - " assert loss_matrix_clean.shape[1] == targets_array_clean.size\n", - "\n", - " optimised_weights = reweight(\n", - " original_weights,\n", - " loss_matrix_clean,\n", - " targets_array_clean,\n", - " log_path=\"calibration_log.csv\",\n", - " penalty_approach=approach,\n", - " epochs=250, # Reduced epochs for faster processing\n", - " )\n", - " data[\"household_weight\"][year] = optimised_weights\n", - "\n", - " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + "def get_output_path(approach, file_name):\n", + " output_path = STORAGE_FOLDER / approach / file_name\n", " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + " return output_path\n", + "\n", + "results = []\n", "\n", - " # Save to HDF5 file\n", - " with h5py.File(output_path, \"w\") 
as f:\n", - " for variable, values in data.items():\n", - " for year, value in values.items():\n", - " f.create_dataset(f\"{variable}/{year}\", data=value)\n", + "for approach in approaches:\n", + " for penalty_weight in penalty_weights:\n", + " # Storing files in correct locations\n", + " cal_log_name = f\"calibration_log_{approach}_{penalty_weight}.csv\"\n", + " h5_name = f\"enhanced_cps_2024_{approach}_{penalty_weight}_minimised.h5\"\n", + " cal_log_path = get_output_path(approach, cal_log_name)\n", + " h5_path = get_output_path(approach, h5_name)\n", + "\n", + " sim = Microsimulation(dataset=input_dataset)\n", + " data = sim.dataset.load_dataset()\n", + " data[\"household_weight\"] = {}\n", + " original_weights = sim.calculate(\"household_weight\")\n", + " original_weights = original_weights.values + np.random.normal(\n", + " 1, 0.1, len(original_weights)\n", + " )\n", + " for year in range(2024, 2025):\n", + " loss_matrix, targets_array = build_loss_matrix(\n", + " input_dataset, year\n", + " )\n", + "\n", + " bad_mask = loss_matrix.columns.isin(bad_targets)\n", + " keep_mask_bool = ~bad_mask\n", + " keep_idx = np.where(keep_mask_bool)[0]\n", + " loss_matrix_clean = loss_matrix.iloc[:, keep_idx]\n", + " targets_array_clean = targets_array[keep_idx]\n", + " assert loss_matrix_clean.shape[1] == targets_array_clean.size\n", + "\n", + " optimised_weights = reweight(\n", + " original_weights,\n", + " loss_matrix_clean,\n", + " targets_array_clean,\n", + " log_path=cal_log_path, \n", + " penalty_approach=approach,\n", + " penalty_weight=penalty_weight, \n", + " epochs=10, # Reduced epochs for faster processing\n", + " )\n", + " data[\"household_weight\"][year] = optimised_weights\n", + "\n", + " # Save to HDF5 file\n", + " with h5py.File(h5_path, \"w\") as f:\n", + " for variable, values in data.items():\n", + " for year, value in values.items():\n", + " f.create_dataset(f\"{variable}/{year}\", data=value)\n", "\n", "\n", "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", @@ -115,7 +130,7 @@ " \"target_fractions\": [0.5, 0.6, 0.7, 0.8, 0.9], # fractions of the dataset to keep\n", " },\n", " \"candidate_loss_contribution\": {\n", - " \"loss_rel_change_max\": [0.00001, 0.000001, 0.0000001] # maximum relative change in\n", + " \"loss_rel_change_max\": [0.001, 0.0001, 0.00001, 0.000001, 0.0000001] # maximum relative change in loss\n", " }\n", "}\n", "\n", @@ -149,10 +164,172 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "id": "35892c9d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + "Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID 
with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid enrollment for IN with target 1623361k\n", + "Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with target 1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + "Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n", + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + "Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid enrollment for IN with target 1623361k\n", + "Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with 
target 1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + "Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n", + "Weight relative change: 100.00%\n", + "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/random_sampling_minimization/1.0_enhanced_cps_2024_random_sampling_minimization_minimised.h5\n", + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + "Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid enrollment for IN with target 1623361k\n", + "Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with target 
1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + "Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n" + ] + } + ], "source": [ "## SMALL CHECKS BELOW -- IGNORE ---\n", "\n", @@ -166,7 +343,7 @@ "# other minimization function approach is \"random_sampling_minimization\", for which you can specify the tolerance for loss relative change.\n", "\n", "for file in files:\n", - " output_path = STORAGE_FOLDER / \"random_sampling_minimization\" / f\"{1.0}_enhanced_cps_2024_minimised.h5\"\n", + " output_path = STORAGE_FOLDER / \"random_sampling_minimization\" / f\"{1.0}_enhanced_cps_2024_random_sampling_minimization_minimised.h5\"\n", " output_path.parent.mkdir(parents=True, exist_ok=True)\n", " minimise_dataset(\n", " file,\n", @@ -231,6 +408,131 @@ " for year, value in values.items():\n", " f.create_dataset(f\"{variable}/{year}\", data=value)" ] + }, + { + "cell_type": "markdown", + "id": "f8b0fe2e", + "metadata": {}, + "source": [ + "### Visualization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "225debd8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
strategyparameterdataset_sizetotal_loss
0originalnone416000.0069
1L10.001350000.0072
2L20.00240000.0072
\n", + "
" + ], + "text/plain": [ + " strategy parameter dataset_size total_loss\n", + "0 original none 41600 0.0069\n", + "1 L1 0.001 35000 0.0072\n", + "2 L2 0.002 4000 0.0072" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"\"\"\n", + "Creating dataframe to store regularization results\n", + "\"\"\"\n", + "\n", + "# Initial dataframe setup\n", + "df = pd.DataFrame({\n", + " 'strategy': ['none'],\n", + " 'parameter': ['none'],\n", + " 'dataset_size': [41310],\n", + " 'total_loss': [6.9e-3]\n", + "})\n", + "\n", + "def add_result(df, strategy, parameter, dataset_size, total_loss):\n", + " new_rows = pd.DataFrame({\n", + " 'strategy': strategy, \n", + " 'parameter': parameter, \n", + " 'dataset_size': dataset_size,\n", + " 'total_loss': total_loss\n", + " })\n", + " return pd.concat([df, new_rows], ignore_index=True)\n", + "\n", + "# Example usage\n", + "#df = add_result(df, ['L1', 'L2'], ['0.001','0.002'], [35000, 4000], [7.2e-3, 7.2e-3])\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7bb3ef3c", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "Pulling values from created calibration_log.csv and .h5 files to populate the line plot dataframe\n", + "\n", + "( I need to pull the strategy (folder name), parameter (from file title??), dataset size (from length of .h5 file), and total loss (from sum of loss column in calibration_log_file.csv))\n", + "\"\"\"\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] } ], "metadata": { From a8af62a73d438efce6e02600916ea7ff7d11272f Mon Sep 17 00:00:00 2001 From: eccuraa Date: Mon, 14 Jul 2025 20:56:08 -0400 Subject: [PATCH 43/58] pulling data from files for plotting --- test_minimization_approach.ipynb | 288 +++++++++++++++++++++++++++---- 1 file changed, 257 insertions(+), 31 deletions(-) diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index bb77568a..e3d011af 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,10 +2,19 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "d6dc9cca", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], "source": [ "from policyengine_us_data.utils.minimise import minimise_dataset, random_sampling_minimization, candidate_loss_contribution\n", "from policyengine_us_data.storage import STORAGE_FOLDER\n", @@ -30,6 +39,82 @@ "]" ] }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2f27c5ab", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 10/10 [00:04<00:00, 2.12it/s, loss=0.0101, loss_rel_change=-0.0371]\n", + "100%|██████████| 10/10 [00:05<00:00, 1.74it/s, loss=0.1, loss_rel_change=-0.00389]\n", + "100%|██████████| 10/10 [00:06<00:00, 1.62it/s, loss=3.22, loss_rel_change=-0.896]\n", + "100%|██████████| 10/10 [00:04<00:00, 2.15it/s, loss=32, loss_rel_change=-0.896] \n" + ] + } + ], + "source": [ + "\n", + "input_dataset = ExtendedCPS_2024\n", + "\n", + "approaches = [\"l0_exp\", \"l1\"] #[\"l0_sigmoid\", \"l0_log\", \"l0_exp\", \"l1\"]\n", + "penalty_weights = [1e-2, 1e-1] #[1e-5, 1e-4, 1e-3, 1e-2, 1e-1]\n", + "\n", + "def get_output_path(approach, file_name):\n", + " output_path = STORAGE_FOLDER / approach / file_name\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + " return output_path\n", + "\n", + "results = []\n", + "\n", + "for approach in approaches:\n", + " for penalty_weight in penalty_weights:\n", + " # Storing files in correct locations\n", + " cal_log_name = f\"calibration_log_{approach}_{penalty_weight}.csv\"\n", + " h5_name = f\"enhanced_cps_2024_{approach}_{penalty_weight}_minimised.h5\"\n", + " cal_log_path = get_output_path(approach, cal_log_name)\n", + " h5_path = get_output_path(approach, h5_name)\n", + "\n", + " sim = Microsimulation(dataset=input_dataset)\n", + " data = sim.dataset.load_dataset()\n", + " data[\"household_weight\"] = {}\n", + " original_weights = sim.calculate(\"household_weight\")\n", + " original_weights = original_weights.values + np.random.normal(\n", + " 1, 0.1, len(original_weights)\n", + " )\n", + " for year in range(2024, 2025):\n", + " loss_matrix, targets_array = build_loss_matrix(\n", + " input_dataset, year\n", + " )\n", + "\n", + " bad_mask = loss_matrix.columns.isin(bad_targets)\n", + " keep_mask_bool = ~bad_mask\n", + " keep_idx = np.where(keep_mask_bool)[0]\n", + " loss_matrix_clean = loss_matrix.iloc[:, keep_idx]\n", + " targets_array_clean = targets_array[keep_idx]\n", + " assert loss_matrix_clean.shape[1] == targets_array_clean.size\n", + "\n", + " optimised_weights = reweight(\n", + " original_weights,\n", + " loss_matrix_clean,\n", + " targets_array_clean,\n", + " log_path=cal_log_path, \n", + " penalty_approach=approach,\n", + " penalty_weight=penalty_weight, \n", + " epochs=10, # Reduced epochs for faster processing\n", + " )\n", + " data[\"household_weight\"][year] = optimised_weights\n", + "\n", + " # Save to HDF5 file\n", + " with h5py.File(h5_path, \"w\") as f:\n", + " for variable, values in data.items():\n", + " for year, value in values.items():\n", + " f.create_dataset(f\"{variable}/{year}\", data=value)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -419,7 +504,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "id": "225debd8", "metadata": {}, "outputs": [ @@ -453,37 +538,21 @@ " \n", " \n", " 0\n", - " original\n", " none\n", - " 41600\n", + " none\n", + " 41310\n", " 0.0069\n", " \n", - " \n", - " 1\n", - " L1\n", - " 0.001\n", - " 35000\n", - 
" 0.0072\n", - " \n", - " \n", - " 2\n", - " L2\n", - " 0.002\n", - " 4000\n", - " 0.0072\n", - " \n", " \n", "\n", "" ], "text/plain": [ - " strategy parameter dataset_size total_loss\n", - "0 original none 41600 0.0069\n", - "1 L1 0.001 35000 0.0072\n", - "2 L2 0.002 4000 0.0072" + " strategy parameter dataset_size total_loss\n", + "0 none none 41310 0.0069" ] }, - "execution_count": 12, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -494,7 +563,7 @@ "\"\"\"\n", "\n", "# Initial dataframe setup\n", - "df = pd.DataFrame({\n", + "reg_results_df = pd.DataFrame({\n", " 'strategy': ['none'],\n", " 'parameter': ['none'],\n", " 'dataset_size': [41310],\n", @@ -508,19 +577,116 @@ " 'dataset_size': dataset_size,\n", " 'total_loss': total_loss\n", " })\n", - " return pd.concat([df, new_rows], ignore_index=True)\n", + " return pd.concat([reg_results_df, new_rows], ignore_index=True)\n", "\n", "# Example usage\n", - "#df = add_result(df, ['L1', 'L2'], ['0.001','0.002'], [35000, 4000], [7.2e-3, 7.2e-3])\n", - "df" + "#reg_results_df = add_result(reg_results_df, ['L1', 'L2'], ['0.001','0.002'], [35000, 4000], [7.2e-3, 7.2e-3])\n", + "reg_results_df" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "id": "7bb3ef3c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
strategyparameterdataset_sizetotal_loss
0nonenone413100.006900
1l0_exp0.01413101263.410322
2l0_exp0.1413101263.410322
3l0_exp0.1413101263.410322
4l10.01413101263.410322
5l10.1413101263.410322
6l10.1413101263.410322
\n", + "
" + ], + "text/plain": [ + " strategy parameter dataset_size total_loss\n", + "0 none none 41310 0.006900\n", + "1 l0_exp 0.01 41310 1263.410322\n", + "2 l0_exp 0.1 41310 1263.410322\n", + "3 l0_exp 0.1 41310 1263.410322\n", + "4 l1 0.01 41310 1263.410322\n", + "5 l1 0.1 41310 1263.410322\n", + "6 l1 0.1 41310 1263.410322" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "\"\"\"\n", "Pulling values from created calibration_log.csv and .h5 files to populate the line plot dataframe\n", @@ -528,10 +694,70 @@ "( I need to pull the strategy (folder name), parameter (from file title??), dataset size (from length of .h5 file), and total loss (from sum of loss column in calibration_log_file.csv))\n", "\"\"\"\n", "\n", + "approaches = [\"l0_exp\", \"l1\"] \n", + "penalty_weights = [1e-2, 1e-1]\n", + "\n", + "def get_output_path(approach, file_name):\n", + " output_path = STORAGE_FOLDER / approach / file_name\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + " return output_path\n", + "\n", + "for approach in approaches:\n", + " total_size = []\n", + " total_loss = []\n", + " for penalty_weight in penalty_weights:\n", + " strategy = approach\n", + " parameter = penalty_weight\n", + "\n", + " # Pull length of .h5 file\n", + " h5_name = f\"enhanced_cps_2024_{strategy}_{parameter}_minimised.h5\"\n", + " h5_path = get_output_path(strategy, h5_name)\n", + " # see if this works\n", + " dataset_size = len(h5py.File(h5_path, \"r\")['household_weight/2024'])\n", + " total_size.append(dataset_size)\n", + "\n", + " # Pull sum of loss column\n", + " cal_log_name = f\"calibration_log_{approach}_{penalty_weight}.csv\"\n", + " cal_log_path = get_output_path(approach, cal_log_name)\n", + " loss_sum = pd.read_csv(cal_log_path)['loss'].sum()\n", + " total_loss.append(loss_sum)\n", + "\n", + " reg_results_df = add_result(reg_results_df, strategy, parameter, total_size, total_loss)\n", + " # does this weird recursion work?\n", + "\n", + "\n", + "\n", "\n", + "'''\n", "\n", + "fraction = [0.5, 0.6, 0.7, 0.8, 0.9]\n", "\n", - "\n" + "for fraction in fraction:\n", + " strategy = \"random_sampling_minimization\"\n", + " parameter = fraction\n", + "\n", + " # Pull length of .h5 file\n", + " h5_name = f\"{fraction}_enhanced_cps_2024_random_sampling_minimization_minimised.h5\"\n", + " h5_path = STORAGE_FOLDER / strategy / h5_name\n", + " dataset_size = len(h5py.File(h5_path, \"r\")['household_weight/2024'])\n", + "\n", + " # Pull sum of loss column\n", + " cal_log_name = f\"{fraction}_enhanced_cps_2024_random_sampling_minimization_minimised_calibration_log.csv\"\n", + " cal_log_path = STORAGE_FOLDER / strategy / cal_log_name\n", + " total_loss = pd.read_csv(cal_log_path)['loss'].sum()\n", + "\n", + " add_result(df, strategy, parameter, dataset_size, total_loss)\n", + "\n", + "'''\n", + "reg_results_df\n" + ] + }, + { + "cell_type": "markdown", + "id": "5b203ccd", + "metadata": {}, + "source": [ + "## Plotting" ] } ], From a917d35200d6aac3d25286c2abad7cad3b1b4ba3 Mon Sep 17 00:00:00 2001 From: eccuraa Date: Mon, 14 Jul 2025 21:15:06 -0400 Subject: [PATCH 44/58] deleted testing cell --- test_minimization_approach.ipynb | 105 +++++-------------------------- 1 file changed, 15 insertions(+), 90 deletions(-) diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index e3d011af..972bd0b7 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,19 +2,10 @@ "cells": [ { "cell_type": "code", - 
"execution_count": 1, + "execution_count": 2, "id": "d6dc9cca", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], + "outputs": [], "source": [ "from policyengine_us_data.utils.minimise import minimise_dataset, random_sampling_minimization, candidate_loss_contribution\n", "from policyengine_us_data.storage import STORAGE_FOLDER\n", @@ -39,82 +30,6 @@ "]" ] }, - { - "cell_type": "code", - "execution_count": 2, - "id": "2f27c5ab", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 10/10 [00:04<00:00, 2.12it/s, loss=0.0101, loss_rel_change=-0.0371]\n", - "100%|██████████| 10/10 [00:05<00:00, 1.74it/s, loss=0.1, loss_rel_change=-0.00389]\n", - "100%|██████████| 10/10 [00:06<00:00, 1.62it/s, loss=3.22, loss_rel_change=-0.896]\n", - "100%|██████████| 10/10 [00:04<00:00, 2.15it/s, loss=32, loss_rel_change=-0.896] \n" - ] - } - ], - "source": [ - "\n", - "input_dataset = ExtendedCPS_2024\n", - "\n", - "approaches = [\"l0_exp\", \"l1\"] #[\"l0_sigmoid\", \"l0_log\", \"l0_exp\", \"l1\"]\n", - "penalty_weights = [1e-2, 1e-1] #[1e-5, 1e-4, 1e-3, 1e-2, 1e-1]\n", - "\n", - "def get_output_path(approach, file_name):\n", - " output_path = STORAGE_FOLDER / approach / file_name\n", - " output_path.parent.mkdir(parents=True, exist_ok=True)\n", - " return output_path\n", - "\n", - "results = []\n", - "\n", - "for approach in approaches:\n", - " for penalty_weight in penalty_weights:\n", - " # Storing files in correct locations\n", - " cal_log_name = f\"calibration_log_{approach}_{penalty_weight}.csv\"\n", - " h5_name = f\"enhanced_cps_2024_{approach}_{penalty_weight}_minimised.h5\"\n", - " cal_log_path = get_output_path(approach, cal_log_name)\n", - " h5_path = get_output_path(approach, h5_name)\n", - "\n", - " sim = Microsimulation(dataset=input_dataset)\n", - " data = sim.dataset.load_dataset()\n", - " data[\"household_weight\"] = {}\n", - " original_weights = sim.calculate(\"household_weight\")\n", - " original_weights = original_weights.values + np.random.normal(\n", - " 1, 0.1, len(original_weights)\n", - " )\n", - " for year in range(2024, 2025):\n", - " loss_matrix, targets_array = build_loss_matrix(\n", - " input_dataset, year\n", - " )\n", - "\n", - " bad_mask = loss_matrix.columns.isin(bad_targets)\n", - " keep_mask_bool = ~bad_mask\n", - " keep_idx = np.where(keep_mask_bool)[0]\n", - " loss_matrix_clean = loss_matrix.iloc[:, keep_idx]\n", - " targets_array_clean = targets_array[keep_idx]\n", - " assert loss_matrix_clean.shape[1] == targets_array_clean.size\n", - "\n", - " optimised_weights = reweight(\n", - " original_weights,\n", - " loss_matrix_clean,\n", - " targets_array_clean,\n", - " log_path=cal_log_path, \n", - " penalty_approach=approach,\n", - " penalty_weight=penalty_weight, \n", - " epochs=10, # Reduced epochs for faster processing\n", - " )\n", - " data[\"household_weight\"][year] = optimised_weights\n", - "\n", - " # Save to HDF5 file\n", - " with h5py.File(h5_path, \"w\") as f:\n", - " for variable, values in data.items():\n", - " for year, value in values.items():\n", - " f.create_dataset(f\"{variable}/{year}\", data=value)" - ] - }, { "cell_type": "code", 
"execution_count": null, @@ -136,7 +51,17 @@ "execution_count": null, "id": "db975ac1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 10/10 [00:03<00:00, 3.00it/s, loss=9.1e-5, loss_rel_change=-0.809] \n", + "100%|██████████| 10/10 [00:03<00:00, 2.96it/s, loss=0.000181, loss_rel_change=-0.679]\n", + "100%|██████████| 10/10 [00:03<00:00, 2.98it/s, loss=0.00108, loss_rel_change=-0.273]\n" + ] + } + ], "source": [ "## ALL TESTS\n", "\n", @@ -763,7 +688,7 @@ ], "metadata": { "kernelspec": { - "display_name": "pe", + "display_name": "policyengine-us-data", "language": "python", "name": "python3" }, @@ -777,7 +702,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.11" + "version": "3.11.13" } }, "nbformat": 4, From 734f54f4325278996d9090df2cb896417581179d Mon Sep 17 00:00:00 2001 From: eccuraa Date: Tue, 15 Jul 2025 14:01:32 -0400 Subject: [PATCH 45/58] current testing arena for Ben --- test_minimization_approach.ipynb | 665 +++++++++++++++++-------------- 1 file changed, 374 insertions(+), 291 deletions(-) diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 972bd0b7..5407c3ea 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 66, "id": "d6dc9cca", "metadata": {}, "outputs": [], @@ -46,6 +46,14 @@ "# After minimization through \"candidate_loss_contribution\" and a 0.001 max error change: 24786" ] }, + { + "cell_type": "markdown", + "id": "e99994d3", + "metadata": {}, + "source": [ + "# Enhanced_CPS_2024.py Approaches" + ] + }, { "cell_type": "code", "execution_count": null, @@ -58,7 +66,82 @@ "text": [ "100%|██████████| 10/10 [00:03<00:00, 3.00it/s, loss=9.1e-5, loss_rel_change=-0.809] \n", "100%|██████████| 10/10 [00:03<00:00, 2.96it/s, loss=0.000181, loss_rel_change=-0.679]\n", - "100%|██████████| 10/10 [00:03<00:00, 2.98it/s, loss=0.00108, loss_rel_change=-0.273]\n" + "100%|██████████| 10/10 [00:03<00:00, 2.98it/s, loss=0.00108, loss_rel_change=-0.273]\n", + "100%|██████████| 10/10 [00:02<00:00, 3.59it/s, loss=0.0101, loss_rel_change=-0.0377]\n", + "100%|██████████| 10/10 [00:02<00:00, 3.46it/s, loss=0.1, loss_rel_change=-0.00391]\n", + "100%|██████████| 10/10 [00:02<00:00, 3.52it/s, loss=0.000191, loss_rel_change=-0.672]\n", + "100%|██████████| 10/10 [00:03<00:00, 2.89it/s, loss=0.00116, loss_rel_change=-0.274]\n", + "100%|██████████| 10/10 [00:02<00:00, 3.57it/s, loss=0.00978, loss_rel_change=-0.166]\n", + "100%|██████████| 10/10 [00:02<00:00, 3.34it/s, loss=0.0881, loss_rel_change=-0.22]\n", + "100%|██████████| 10/10 [00:02<00:00, 3.55it/s, loss=0.866, loss_rel_change=-0.23]\n", + "100%|██████████| 10/10 [00:03<00:00, 3.31it/s, loss=9.12e-5, loss_rel_change=-0.812]\n", + "100%|██████████| 10/10 [00:03<00:00, 3.26it/s, loss=0.00018, loss_rel_change=-0.687]\n", + "100%|██████████| 10/10 [00:02<00:00, 3.47it/s, loss=0.00108, loss_rel_change=-0.263]\n", + "100%|██████████| 10/10 [00:03<00:00, 3.21it/s, loss=0.0101, loss_rel_change=-0.0373]\n", + "100%|██████████| 10/10 [00:02<00:00, 3.37it/s, loss=0.1, loss_rel_change=-0.00383]\n", + "100%|██████████| 10/10 [00:03<00:00, 3.28it/s, loss=0.00389, loss_rel_change=-0.875]\n", + "100%|██████████| 10/10 [00:03<00:00, 3.17it/s, loss=0.0328, loss_rel_change=-0.894]\n", + "100%|██████████| 10/10 [00:03<00:00, 2.72it/s, loss=0.321, 
loss_rel_change=-0.896]\n", + "100%|██████████| 10/10 [00:02<00:00, 3.49it/s, loss=3.21, loss_rel_change=-0.896]\n", + "100%|██████████| 10/10 [00:02<00:00, 3.37it/s, loss=32.1, loss_rel_change=-0.896]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== CALIBRATION LOG DEBUG ===\n", + "File path: /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/enhanced_cps_2024.h5\n", + "Epoch: 0\n", + "Number of households: 41310\n", + "Total weight: 12764381616743.21\n", + "Weight range: 0.54 to 1303728.75\n", + "Loss matrix shape: (41310, 2813)\n", + "Number of targets: 2813\n", + "After filtering bad targets:\n", + "Loss matrix clean shape: (41310, 2805)\n", + "Number of clean targets: 2805\n", + "Estimates shape: (2805,)\n", + "Estimates sum: 324584770671300.88\n", + "First 3 estimates: nation/irs/adjusted gross income/total/AGI in -inf-inf/taxable/All 1.498784e+13\n", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/All 1.609638e+10\n", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/All 6.707770e+10\n", + "dtype: float64\n", + "First 3 targets: [1.62972204e+13 1.68634879e+10 6.76819729e+10]\n", + "Mean absolute error: 17235490830.73\n", + "Mean relative error: 0.0997\n", + "=== END DEBUG ===\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250/250 [01:38<00:00, 2.54it/s, loss=3.62e-5, loss_rel_change=-0.301]\n", + "100%|██████████| 250/250 [01:35<00:00, 2.62it/s, loss=3.58e-5, loss_rel_change=-0.294]\n", + "100%|██████████| 250/250 [01:33<00:00, 2.68it/s, loss=3.34e-5, loss_rel_change=-0.376]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Weight relative change: 99.95%\n", + "Re-calibrating final selected households...\n" + ] + }, + { + "ename": "AttributeError", + "evalue": "'numpy.ndarray' object has no attribute 'columns'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 92\u001b[0m\n\u001b[1;32m 90\u001b[0m output_path \u001b[38;5;241m=\u001b[39m STORAGE_FOLDER \u001b[38;5;241m/\u001b[39m approach \u001b[38;5;241m/\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvalue\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_enhanced_cps_2024_minimised.h5\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 91\u001b[0m output_path\u001b[38;5;241m.\u001b[39mparent\u001b[38;5;241m.\u001b[39mmkdir(parents\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m---> 92\u001b[0m \u001b[43mminimise_dataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 93\u001b[0m \u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 94\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 95\u001b[0m \u001b[43m \u001b[49m\u001b[43mminimization_function\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mminimization_function\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 96\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget_fractions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 97\u001b[0m \u001b[43m 
\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 98\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m params \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mloss_rel_change_max\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 99\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m files:\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/utils/minimise.py:430\u001b[0m, in \u001b[0;36mminimise_dataset\u001b[0;34m(dataset, output_path, minimization_function, **kwargs)\u001b[0m\n\u001b[1;32m 428\u001b[0m \u001b[38;5;66;03m# Re-calibrate the final selected households to hit targets\u001b[39;00m\n\u001b[1;32m 429\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRe-calibrating final selected households...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 430\u001b[0m calibrated_weights \u001b[38;5;241m=\u001b[39m \u001b[43mreweight\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 431\u001b[0m \u001b[43m \u001b[49m\u001b[43minitial_weights\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 432\u001b[0m \u001b[43m \u001b[49m\u001b[43mloss_matrix_clean\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Convert to numpy array\u001b[39;49;00m\n\u001b[1;32m 433\u001b[0m \u001b[43m \u001b[49m\u001b[43mtargets_clean\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 434\u001b[0m \u001b[43m \u001b[49m\u001b[43mepochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m250\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Reduced epochs for faster processing\u001b[39;49;00m\n\u001b[1;32m 435\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 436\u001b[0m sim\u001b[38;5;241m.\u001b[39mset_input(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhousehold_weight\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;241m2024\u001b[39m, calibrated_weights)\n\u001b[1;32m 437\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFinal calibration completed successfully\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/datasets/cps/enhanced_cps.py:47\u001b[0m, in \u001b[0;36mreweight\u001b[0;34m(original_weights, loss_matrix, targets_array, dropout_rate, epochs, log_path, penalty_approach, penalty_weight)\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mreweight\u001b[39m(\n\u001b[1;32m 38\u001b[0m original_weights,\n\u001b[1;32m 39\u001b[0m loss_matrix,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 45\u001b[0m penalty_weight\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 46\u001b[0m ):\n\u001b[0;32m---> 47\u001b[0m target_names \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39marray(\u001b[43mloss_matrix\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m)\n\u001b[1;32m 48\u001b[0m is_national \u001b[38;5;241m=\u001b[39m loss_matrix\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mstr\u001b[38;5;241m.\u001b[39mstartswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnation/\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 49\u001b[0m loss_matrix \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mtensor(loss_matrix\u001b[38;5;241m.\u001b[39mvalues, dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat32)\n", + 
"\u001b[0;31mAttributeError\u001b[0m: 'numpy.ndarray' object has no attribute 'columns'" ] } ], @@ -66,7 +149,6 @@ "## ALL TESTS\n", "\n", "## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation\n", - "\n", "input_dataset = ExtendedCPS_2024\n", "\n", "approaches = [\"l0_sigmoid\", \"l0_log\", \"l0_exp\", \"l1\"]\n", @@ -113,7 +195,7 @@ " log_path=cal_log_path, \n", " penalty_approach=approach,\n", " penalty_weight=penalty_weight, \n", - " epochs=10, # Reduced epochs for faster processing\n", + " epochs=250, # Reduced epochs for faster processing\n", " )\n", " data[\"household_weight\"][year] = optimised_weights\n", "\n", @@ -121,9 +203,83 @@ " with h5py.File(h5_path, \"w\") as f:\n", " for variable, values in data.items():\n", " for year, value in values.items():\n", - " f.create_dataset(f\"{variable}/{year}\", data=value)\n", - "\n", - "\n", + " f.create_dataset(f\"{variable}/{year}\", data=value)" + ] + }, + { + "cell_type": "markdown", + "id": "69ff392d", + "metadata": {}, + "source": [ + "# Minimise.py approaches" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "aeab67b3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== CALIBRATION LOG DEBUG ===\n", + "File path: /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/enhanced_cps_2024.h5\n", + "Epoch: 0\n", + "Number of households: 41310\n", + "Total weight: 12764381616743.21\n", + "Weight range: 0.54 to 1303728.75\n", + "Loss matrix shape: (41310, 2813)\n", + "Number of targets: 2813\n", + "After filtering bad targets:\n", + "Loss matrix clean shape: (41310, 2805)\n", + "Number of clean targets: 2805\n", + "Estimates shape: (2805,)\n", + "Estimates sum: 324584770671300.88\n", + "First 3 estimates: nation/irs/adjusted gross income/total/AGI in -inf-inf/taxable/All 1.498784e+13\n", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/All 1.609638e+10\n", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/All 6.707770e+10\n", + "dtype: float64\n", + "First 3 targets: [1.62972204e+13 1.68634879e+10 6.76819729e+10]\n", + "Mean absolute error: 17235490830.73\n", + "Mean relative error: 0.0997\n", + "=== END DEBUG ===\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250/250 [03:38<00:00, 1.14it/s, loss=3.34e-5, loss_rel_change=-0.357]\n", + "100%|██████████| 250/250 [02:39<00:00, 1.57it/s, loss=3.52e-5, loss_rel_change=-0.334]\n", + "100%|██████████| 250/250 [01:32<00:00, 2.70it/s, loss=3.39e-5, loss_rel_change=-0.34] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Weight relative change: 99.95%\n", + "Re-calibrating final selected households...\n" + ] + }, + { + "ename": "AttributeError", + "evalue": "'numpy.ndarray' object has no attribute 'columns'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[67], line 31\u001b[0m\n\u001b[1;32m 29\u001b[0m output_path \u001b[38;5;241m=\u001b[39m STORAGE_FOLDER \u001b[38;5;241m/\u001b[39m approach \u001b[38;5;241m/\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvalue\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_enhanced_cps_2024_minimised.h5\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 
30\u001b[0m output_path\u001b[38;5;241m.\u001b[39mparent\u001b[38;5;241m.\u001b[39mmkdir(parents\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m---> 31\u001b[0m \u001b[43mminimise_dataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 32\u001b[0m \u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 33\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 34\u001b[0m \u001b[43m \u001b[49m\u001b[43mminimization_function\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mminimization_function\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 35\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget_fractions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 36\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m params \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mloss_rel_change_max\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 38\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m files:\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/utils/minimise.py:430\u001b[0m, in \u001b[0;36mminimise_dataset\u001b[0;34m(dataset, output_path, minimization_function, **kwargs)\u001b[0m\n\u001b[1;32m 428\u001b[0m \u001b[38;5;66;03m# Re-calibrate the final selected households to hit targets\u001b[39;00m\n\u001b[1;32m 429\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRe-calibrating final selected households...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 430\u001b[0m calibrated_weights \u001b[38;5;241m=\u001b[39m \u001b[43mreweight\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 431\u001b[0m \u001b[43m \u001b[49m\u001b[43minitial_weights\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 432\u001b[0m \u001b[43m \u001b[49m\u001b[43mloss_matrix_clean\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Convert to numpy array\u001b[39;49;00m\n\u001b[1;32m 433\u001b[0m \u001b[43m \u001b[49m\u001b[43mtargets_clean\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 434\u001b[0m \u001b[43m \u001b[49m\u001b[43mepochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Reduced epochs for faster processing\u001b[39;49;00m\n\u001b[1;32m 435\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 436\u001b[0m sim\u001b[38;5;241m.\u001b[39mset_input(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhousehold_weight\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;241m2024\u001b[39m, calibrated_weights)\n\u001b[1;32m 437\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFinal calibration completed successfully\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/datasets/cps/enhanced_cps.py:47\u001b[0m, in \u001b[0;36mreweight\u001b[0;34m(original_weights, loss_matrix, targets_array, dropout_rate, epochs, log_path, penalty_approach, penalty_weight)\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mreweight\u001b[39m(\n\u001b[1;32m 
38\u001b[0m original_weights,\n\u001b[1;32m 39\u001b[0m loss_matrix,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 45\u001b[0m penalty_weight\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 46\u001b[0m ):\n\u001b[0;32m---> 47\u001b[0m target_names \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39marray(\u001b[43mloss_matrix\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m)\n\u001b[1;32m 48\u001b[0m is_national \u001b[38;5;241m=\u001b[39m loss_matrix\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mstr\u001b[38;5;241m.\u001b[39mstartswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnation/\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 49\u001b[0m loss_matrix \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mtensor(loss_matrix\u001b[38;5;241m.\u001b[39mvalues, dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat32)\n", + "\u001b[0;31mAttributeError\u001b[0m: 'numpy.ndarray' object has no attribute 'columns'" + ] + } + ], + "source": [ "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", "\n", "files = [\n", @@ -137,10 +293,10 @@ "\n", "optional_params = {\n", " \"random_sampling_minimization\": {\n", - " \"target_fractions\": [0.5, 0.6, 0.7, 0.8, 0.9], # fractions of the dataset to keep\n", + " \"target_fractions\": [0.5, 0.6]#, 0.7, 0.8, 0.9], # fractions of the dataset to keep\n", " },\n", " \"candidate_loss_contribution\": {\n", - " \"loss_rel_change_max\": [0.001, 0.0001, 0.00001, 0.000001, 0.0000001] # maximum relative change in loss\n", + " \"loss_rel_change_max\": [0.001, 0.0001]#, 0.00001, 0.000001, 0.0000001] # maximum relative change in loss\n", " }\n", "}\n", "\n", @@ -173,250 +329,22 @@ ] }, { - "cell_type": "code", - "execution_count": 22, - "id": "35892c9d", + "cell_type": "markdown", + "id": "fa1ea957", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Targeting Medicaid enrollment for AK with target 231577k\n", - "Targeting Medicaid enrollment for AL with target 766009k\n", - "Targeting Medicaid enrollment for AR with target 733561k\n", - "Targeting Medicaid enrollment for AZ with target 1778734k\n", - "Targeting Medicaid enrollment for CA with target 12172695k\n", - "Targeting Medicaid enrollment for CO with target 1058326k\n", - "Targeting Medicaid enrollment for CT with target 904321k\n", - "Targeting Medicaid enrollment for DC with target 240020k\n", - "Targeting Medicaid enrollment for DE with target 236840k\n", - "Targeting Medicaid enrollment for FL with target 3568648k\n", - "Targeting Medicaid enrollment for GA with target 1699279k\n", - "Targeting Medicaid enrollment for HI with target 376318k\n", - "Targeting Medicaid enrollment for IA with target 586748k\n", - "Targeting Medicaid enrollment for ID with target 296968k\n", - "Targeting Medicaid enrollment for IL with target 2918179k\n", - "Targeting Medicaid enrollment for IN with target 1623361k\n", - "Targeting Medicaid enrollment for KS with target 335902k\n", - "Targeting Medicaid enrollment for KY with target 1244822k\n", - "Targeting Medicaid enrollment for LA with target 1377806k\n", - "Targeting Medicaid enrollment for MA with target 1453344k\n", - "Targeting Medicaid enrollment for MD with target 1280697k\n", - "Targeting Medicaid enrollment for ME with target 322306k\n", - "Targeting Medicaid enrollment for MI with target 2194067k\n", - "Targeting Medicaid enrollment for MN with target 1146667k\n", - "Targeting 
Medicaid enrollment for MO with target 1118780k\n", - "Targeting Medicaid enrollment for MS with target 514730k\n", - "Targeting Medicaid enrollment for MT with target 193278k\n", - "Targeting Medicaid enrollment for NC with target 2469712k\n", - "Targeting Medicaid enrollment for ND with target 100543k\n", - "Targeting Medicaid enrollment for NE with target 302971k\n", - "Targeting Medicaid enrollment for NH with target 166813k\n", - "Targeting Medicaid enrollment for NJ with target 1506239k\n", - "Targeting Medicaid enrollment for NM with target 686825k\n", - "Targeting Medicaid enrollment for NV with target 713936k\n", - "Targeting Medicaid enrollment for NY with target 5946806k\n", - "Targeting Medicaid enrollment for OH with target 2596879k\n", - "Targeting Medicaid enrollment for OK with target 894911k\n", - "Targeting Medicaid enrollment for OR with target 1123313k\n", - "Targeting Medicaid enrollment for PA with target 2783389k\n", - "Targeting Medicaid enrollment for RI with target 273400k\n", - "Targeting Medicaid enrollment for SC with target 932515k\n", - "Targeting Medicaid enrollment for SD with target 126952k\n", - "Targeting Medicaid enrollment for TN with target 1268904k\n", - "Targeting Medicaid enrollment for TX with target 3821806k\n", - "Targeting Medicaid enrollment for UT with target 300742k\n", - "Targeting Medicaid enrollment for VA with target 1596777k\n", - "Targeting Medicaid enrollment for VT with target 151833k\n", - "Targeting Medicaid enrollment for WA with target 1776116k\n", - "Targeting Medicaid enrollment for WI with target 1108320k\n", - "Targeting Medicaid enrollment for WV with target 467632k\n", - "Targeting Medicaid enrollment for WY with target 57320k\n", - "Targeting Medicaid enrollment for AK with target 231577k\n", - "Targeting Medicaid enrollment for AL with target 766009k\n", - "Targeting Medicaid enrollment for AR with target 733561k\n", - "Targeting Medicaid enrollment for AZ with target 1778734k\n", - "Targeting Medicaid enrollment for CA with target 12172695k\n", - "Targeting Medicaid enrollment for CO with target 1058326k\n", - "Targeting Medicaid enrollment for CT with target 904321k\n", - "Targeting Medicaid enrollment for DC with target 240020k\n", - "Targeting Medicaid enrollment for DE with target 236840k\n", - "Targeting Medicaid enrollment for FL with target 3568648k\n", - "Targeting Medicaid enrollment for GA with target 1699279k\n", - "Targeting Medicaid enrollment for HI with target 376318k\n", - "Targeting Medicaid enrollment for IA with target 586748k\n", - "Targeting Medicaid enrollment for ID with target 296968k\n", - "Targeting Medicaid enrollment for IL with target 2918179k\n", - "Targeting Medicaid enrollment for IN with target 1623361k\n", - "Targeting Medicaid enrollment for KS with target 335902k\n", - "Targeting Medicaid enrollment for KY with target 1244822k\n", - "Targeting Medicaid enrollment for LA with target 1377806k\n", - "Targeting Medicaid enrollment for MA with target 1453344k\n", - "Targeting Medicaid enrollment for MD with target 1280697k\n", - "Targeting Medicaid enrollment for ME with target 322306k\n", - "Targeting Medicaid enrollment for MI with target 2194067k\n", - "Targeting Medicaid enrollment for MN with target 1146667k\n", - "Targeting Medicaid enrollment for MO with target 1118780k\n", - "Targeting Medicaid enrollment for MS with target 514730k\n", - "Targeting Medicaid enrollment for MT with target 193278k\n", - "Targeting Medicaid enrollment for NC with target 2469712k\n", - "Targeting 
Medicaid enrollment for ND with target 100543k\n", - "Targeting Medicaid enrollment for NE with target 302971k\n", - "Targeting Medicaid enrollment for NH with target 166813k\n", - "Targeting Medicaid enrollment for NJ with target 1506239k\n", - "Targeting Medicaid enrollment for NM with target 686825k\n", - "Targeting Medicaid enrollment for NV with target 713936k\n", - "Targeting Medicaid enrollment for NY with target 5946806k\n", - "Targeting Medicaid enrollment for OH with target 2596879k\n", - "Targeting Medicaid enrollment for OK with target 894911k\n", - "Targeting Medicaid enrollment for OR with target 1123313k\n", - "Targeting Medicaid enrollment for PA with target 2783389k\n", - "Targeting Medicaid enrollment for RI with target 273400k\n", - "Targeting Medicaid enrollment for SC with target 932515k\n", - "Targeting Medicaid enrollment for SD with target 126952k\n", - "Targeting Medicaid enrollment for TN with target 1268904k\n", - "Targeting Medicaid enrollment for TX with target 3821806k\n", - "Targeting Medicaid enrollment for UT with target 300742k\n", - "Targeting Medicaid enrollment for VA with target 1596777k\n", - "Targeting Medicaid enrollment for VT with target 151833k\n", - "Targeting Medicaid enrollment for WA with target 1776116k\n", - "Targeting Medicaid enrollment for WI with target 1108320k\n", - "Targeting Medicaid enrollment for WV with target 467632k\n", - "Targeting Medicaid enrollment for WY with target 57320k\n", - "Weight relative change: 100.00%\n", - "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/random_sampling_minimization/1.0_enhanced_cps_2024_random_sampling_minimization_minimised.h5\n", - "Targeting Medicaid enrollment for AK with target 231577k\n", - "Targeting Medicaid enrollment for AL with target 766009k\n", - "Targeting Medicaid enrollment for AR with target 733561k\n", - "Targeting Medicaid enrollment for AZ with target 1778734k\n", - "Targeting Medicaid enrollment for CA with target 12172695k\n", - "Targeting Medicaid enrollment for CO with target 1058326k\n", - "Targeting Medicaid enrollment for CT with target 904321k\n", - "Targeting Medicaid enrollment for DC with target 240020k\n", - "Targeting Medicaid enrollment for DE with target 236840k\n", - "Targeting Medicaid enrollment for FL with target 3568648k\n", - "Targeting Medicaid enrollment for GA with target 1699279k\n", - "Targeting Medicaid enrollment for HI with target 376318k\n", - "Targeting Medicaid enrollment for IA with target 586748k\n", - "Targeting Medicaid enrollment for ID with target 296968k\n", - "Targeting Medicaid enrollment for IL with target 2918179k\n", - "Targeting Medicaid enrollment for IN with target 1623361k\n", - "Targeting Medicaid enrollment for KS with target 335902k\n", - "Targeting Medicaid enrollment for KY with target 1244822k\n", - "Targeting Medicaid enrollment for LA with target 1377806k\n", - "Targeting Medicaid enrollment for MA with target 1453344k\n", - "Targeting Medicaid enrollment for MD with target 1280697k\n", - "Targeting Medicaid enrollment for ME with target 322306k\n", - "Targeting Medicaid enrollment for MI with target 2194067k\n", - "Targeting Medicaid enrollment for MN with target 1146667k\n", - "Targeting Medicaid enrollment for MO with target 1118780k\n", - "Targeting Medicaid enrollment for MS with target 514730k\n", - "Targeting Medicaid enrollment for MT with target 193278k\n", - "Targeting Medicaid enrollment for NC with target 2469712k\n", - "Targeting Medicaid 
enrollment for ND with target 100543k\n", - "Targeting Medicaid enrollment for NE with target 302971k\n", - "Targeting Medicaid enrollment for NH with target 166813k\n", - "Targeting Medicaid enrollment for NJ with target 1506239k\n", - "Targeting Medicaid enrollment for NM with target 686825k\n", - "Targeting Medicaid enrollment for NV with target 713936k\n", - "Targeting Medicaid enrollment for NY with target 5946806k\n", - "Targeting Medicaid enrollment for OH with target 2596879k\n", - "Targeting Medicaid enrollment for OK with target 894911k\n", - "Targeting Medicaid enrollment for OR with target 1123313k\n", - "Targeting Medicaid enrollment for PA with target 2783389k\n", - "Targeting Medicaid enrollment for RI with target 273400k\n", - "Targeting Medicaid enrollment for SC with target 932515k\n", - "Targeting Medicaid enrollment for SD with target 126952k\n", - "Targeting Medicaid enrollment for TN with target 1268904k\n", - "Targeting Medicaid enrollment for TX with target 3821806k\n", - "Targeting Medicaid enrollment for UT with target 300742k\n", - "Targeting Medicaid enrollment for VA with target 1596777k\n", - "Targeting Medicaid enrollment for VT with target 151833k\n", - "Targeting Medicaid enrollment for WA with target 1776116k\n", - "Targeting Medicaid enrollment for WI with target 1108320k\n", - "Targeting Medicaid enrollment for WV with target 467632k\n", - "Targeting Medicaid enrollment for WY with target 57320k\n" - ] - } - ], "source": [ - "## SMALL CHECKS BELOW -- IGNORE ---\n", - "\n", - "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", - "\n", - "files = [\n", - " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", - " ]\n", - "\n", - "minimization_function = random_sampling_minimization\n", - "# other minimization function approach is \"random_sampling_minimization\", for which you can specify the tolerance for loss relative change.\n", - "\n", - "for file in files:\n", - " output_path = STORAGE_FOLDER / \"random_sampling_minimization\" / f\"{1.0}_enhanced_cps_2024_random_sampling_minimization_minimised.h5\"\n", - " output_path.parent.mkdir(parents=True, exist_ok=True)\n", - " minimise_dataset(\n", - " file,\n", - " output_path,\n", - " minimization_function=minimization_function, \n", - " target_fractions=[1.0]\n", - " )" + "### (Temporary) Cleaning of data (removing weights smaller than epsilon)" ] }, { "cell_type": "code", "execution_count": null, - "id": "b4cf8e89", + "id": "e88df261", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250/250 [01:24<00:00, 2.98it/s, loss=3.37e-5, loss_rel_change=-0.92] \n" - ] - } - ], + "outputs": [], "source": [ - "input_dataset = ExtendedCPS_2024\n", - "\n", - "sim = Microsimulation(dataset=input_dataset)\n", - "data = sim.dataset.load_dataset()\n", - "data[\"household_weight\"] = {}\n", - "original_weights = sim.calculate(\"household_weight\")\n", - "original_weights = original_weights.values + np.random.normal(\n", - " 1, 0.1, len(original_weights)\n", - ")\n", - "for year in range(2024, 2025):\n", - " loss_matrix, targets_array = build_loss_matrix(\n", - " input_dataset, year\n", - " )\n", - "\n", - " bad_mask = loss_matrix.columns.isin(bad_targets)\n", - " keep_mask_bool = ~bad_mask\n", - " keep_idx = np.where(keep_mask_bool)[0]\n", - " loss_matrix_clean = loss_matrix.iloc[:, keep_idx]\n", - " targets_array_clean = targets_array[keep_idx]\n", - " assert loss_matrix_clean.shape[1] == 
targets_array_clean.size\n", - " assert loss_matrix_clean.shape[1] != loss_matrix.shape[1]\n", - "\n", - " optimised_weights = reweight(\n", - " original_weights,\n", - " loss_matrix_clean,\n", - " targets_array_clean,\n", - " log_path=\"baseline_calibration_log.csv\",\n", - " epochs=250, # Reduced epochs for faster processing\n", - " )\n", - " data[\"household_weight\"][year] = optimised_weights\n", - "\n", - "output_path = STORAGE_FOLDER / \"baseline\" / \"enhanced_cps_2024_baseline.h5\"\n", - "output_path.parent.mkdir(parents=True, exist_ok=True)\n", - "\n", - "# Save to HDF5 file\n", - "with h5py.File(output_path, \"w\") as f:\n", - " for variable, values in data.items():\n", - " for year, value in values.items():\n", - " f.create_dataset(f\"{variable}/{year}\", data=value)" + "## this should go in the enhanced_cps_2024.py file, because household removal doesn't happen there\n", + "# Need to check Ben's PR." ] }, { @@ -429,7 +357,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 62, "id": "225debd8", "metadata": {}, "outputs": [ @@ -477,16 +405,31 @@ "0 none none 41310 0.0069" ] }, - "execution_count": 33, + "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"\"\"\n", + "Creating scoring of loss\n", "Creating dataframe to store regularization results\n", "\"\"\"\n", "\n", + "# Calculate quality categories\n", + "def loss_score(calibration_log):\n", + " excellent_count = (\n", + " calibration_log[\"rel_abs_error\"] < 0.05).sum() # < 5% error\n", + " good_count = (\n", + " (calibration_log[\"rel_abs_error\"] >= 0.05)\n", + " & (calibration_log[\"rel_abs_error\"] < 0.20)).sum() # 5-20% error\n", + " total_targets = len(calibration_log)\n", + " # Calculate quality score\n", + " quality_score = (excellent_count * 100 + good_count * 75) / total_targets\n", + " return quality_score\n", + "\n", + "\n", + "\n", "# Initial dataframe setup\n", "reg_results_df = pd.DataFrame({\n", " 'strategy': ['none'],\n", @@ -499,8 +442,8 @@ " new_rows = pd.DataFrame({\n", " 'strategy': strategy, \n", " 'parameter': parameter, \n", - " 'dataset_size': dataset_size,\n", - " 'total_loss': total_loss\n", + " 'dataset_size': [dataset_size],\n", + " 'total_loss': [total_loss]\n", " })\n", " return pd.concat([reg_results_df, new_rows], ignore_index=True)\n", "\n", @@ -511,7 +454,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 63, "id": "7bb3ef3c", "metadata": {}, "outputs": [ @@ -548,66 +491,210 @@ " none\n", " none\n", " 41310\n", - " 0.006900\n", + " 0.0069\n", " \n", " \n", " 1\n", - " l0_exp\n", - " 0.01\n", + " l0_sigmoid\n", + " 1.0\n", " 41310\n", - " 1263.410322\n", + " 0.0069\n", " \n", " \n", " 2\n", - " l0_exp\n", + " l0_sigmoid\n", " 0.1\n", " 41310\n", - " 1263.410322\n", + " 39.2959\n", " \n", " \n", " 3\n", + " l0_sigmoid\n", + " 0.01\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 4\n", + " l0_sigmoid\n", + " 0.001\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 5\n", + " l0_sigmoid\n", + " 0.0001\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 6\n", + " l0_sigmoid\n", + " 0.00001\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 7\n", + " l0_log\n", + " 1.0\n", + " 41310\n", + " 0.0069\n", + " \n", + " \n", + " 8\n", + " l0_log\n", + " 0.1\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 9\n", + " l0_log\n", + " 0.01\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 10\n", + " l0_log\n", + " 0.001\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 11\n", + " 
l0_log\n", + " 0.0001\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 12\n", + " l0_log\n", + " 0.00001\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 13\n", + " l0_exp\n", + " 1.0\n", + " 41310\n", + " 0.0069\n", + " \n", + " \n", + " 14\n", " l0_exp\n", " 0.1\n", " 41310\n", - " 1263.410322\n", + " 39.2959\n", " \n", " \n", - " 4\n", - " l1\n", + " 15\n", + " l0_exp\n", " 0.01\n", " 41310\n", - " 1263.410322\n", + " 39.2959\n", " \n", " \n", - " 5\n", + " 16\n", + " l0_exp\n", + " 0.001\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 17\n", + " l0_exp\n", + " 0.0001\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 18\n", + " l0_exp\n", + " 0.00001\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 19\n", " l1\n", - " 0.1\n", + " 1.0\n", " 41310\n", - " 1263.410322\n", + " 0.0069\n", " \n", " \n", - " 6\n", + " 20\n", " l1\n", " 0.1\n", " 41310\n", - " 1263.410322\n", + " 39.2959\n", + " \n", + " \n", + " 21\n", + " l1\n", + " 0.01\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 22\n", + " l1\n", + " 0.001\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 23\n", + " l1\n", + " 0.0001\n", + " 41310\n", + " 39.2959\n", + " \n", + " \n", + " 24\n", + " l1\n", + " 0.00001\n", + " 41310\n", + " 39.2959\n", " \n", " \n", "\n", "" ], "text/plain": [ - " strategy parameter dataset_size total_loss\n", - "0 none none 41310 0.006900\n", - "1 l0_exp 0.01 41310 1263.410322\n", - "2 l0_exp 0.1 41310 1263.410322\n", - "3 l0_exp 0.1 41310 1263.410322\n", - "4 l1 0.01 41310 1263.410322\n", - "5 l1 0.1 41310 1263.410322\n", - "6 l1 0.1 41310 1263.410322" + " strategy parameter dataset_size total_loss\n", + "0 none none 41310 0.0069\n", + "1 l0_sigmoid 1.0 41310 0.0069\n", + "2 l0_sigmoid 0.1 41310 39.2959\n", + "3 l0_sigmoid 0.01 41310 39.2959\n", + "4 l0_sigmoid 0.001 41310 39.2959\n", + "5 l0_sigmoid 0.0001 41310 39.2959\n", + "6 l0_sigmoid 0.00001 41310 39.2959\n", + "7 l0_log 1.0 41310 0.0069\n", + "8 l0_log 0.1 41310 39.2959\n", + "9 l0_log 0.01 41310 39.2959\n", + "10 l0_log 0.001 41310 39.2959\n", + "11 l0_log 0.0001 41310 39.2959\n", + "12 l0_log 0.00001 41310 39.2959\n", + "13 l0_exp 1.0 41310 0.0069\n", + "14 l0_exp 0.1 41310 39.2959\n", + "15 l0_exp 0.01 41310 39.2959\n", + "16 l0_exp 0.001 41310 39.2959\n", + "17 l0_exp 0.0001 41310 39.2959\n", + "18 l0_exp 0.00001 41310 39.2959\n", + "19 l1 1.0 41310 0.0069\n", + "20 l1 0.1 41310 39.2959\n", + "21 l1 0.01 41310 39.2959\n", + "22 l1 0.001 41310 39.2959\n", + "23 l1 0.0001 41310 39.2959\n", + "24 l1 0.00001 41310 39.2959" ] }, - "execution_count": 34, + "execution_count": 63, "metadata": {}, "output_type": "execute_result" } @@ -617,21 +704,19 @@ "Pulling values from created calibration_log.csv and .h5 files to populate the line plot dataframe\n", "\n", "( I need to pull the strategy (folder name), parameter (from file title??), dataset size (from length of .h5 file), and total loss (from sum of loss column in calibration_log_file.csv))\n", - "\"\"\"\n", "\n", "approaches = [\"l0_exp\", \"l1\"] \n", "penalty_weights = [1e-2, 1e-1]\n", - "\n", - "def get_output_path(approach, file_name):\n", - " output_path = STORAGE_FOLDER / approach / file_name\n", - " output_path.parent.mkdir(parents=True, exist_ok=True)\n", - " return output_path\n", + "\"\"\"\n", + "approaches = [\"l0_sigmoid\", \"l0_log\", \"l0_exp\", \"l1\"]\n", + "penalty_weights = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]\n", + "og_size = 41310 # Original size of the dataset\n", + "og_loss = 6.9e-3 # Original loss from the baseline dataset\n", 
"\n", "for approach in approaches:\n", - " total_size = []\n", - " total_loss = []\n", + " strategy = approach\n", + " reg_results_df = add_result(reg_results_df, strategy, 1.0, og_size, og_loss)\n", " for penalty_weight in penalty_weights:\n", - " strategy = approach\n", " parameter = penalty_weight\n", "\n", " # Pull length of .h5 file\n", @@ -639,17 +724,15 @@ " h5_path = get_output_path(strategy, h5_name)\n", " # see if this works\n", " dataset_size = len(h5py.File(h5_path, \"r\")['household_weight/2024'])\n", - " total_size.append(dataset_size)\n", + " #total_size.append(dataset_size)\n", "\n", " # Pull sum of loss column\n", - " cal_log_name = f\"calibration_log_{approach}_{penalty_weight}.csv\"\n", - " cal_log_path = get_output_path(approach, cal_log_name)\n", - " loss_sum = pd.read_csv(cal_log_path)['loss'].sum()\n", - " total_loss.append(loss_sum)\n", - "\n", - " reg_results_df = add_result(reg_results_df, strategy, parameter, total_size, total_loss)\n", - " # does this weird recursion work?\n", - "\n", + " cal_log_name = f\"calibration_log_{strategy}_{parameter}.csv\"\n", + " cal_log_path = get_output_path(strategy, cal_log_name)\n", + " calibration_log = pd.read_csv(cal_log_path)\n", + " loss_value = loss_score(calibration_log)\n", + " \n", + " reg_results_df = add_result(reg_results_df, strategy, parameter, dataset_size, loss_value)\n", "\n", "\n", "\n", From 64c81498fd314b922fe04185724bccb0dbaa8524 Mon Sep 17 00:00:00 2001 From: eccuraa Date: Tue, 15 Jul 2025 15:02:16 -0400 Subject: [PATCH 46/58] not much new --- test_minimization_approach.ipynb | 43 ++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 5407c3ea..ea561155 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -767,6 +767,49 @@ "source": [ "## Plotting" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9602953a", + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "Synthetic dataset\n", + "'''\n", + "\n", + "# Define values\n", + "strategies = ['l0_sigmoid', 'l0_log', 'l0_exp', 'l1']\n", + "parameters = [1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001]\n", + "\n", + "# Synthetic values\n", + "base_size = 41310\n", + "min_size = 20000\n", + "base_loss = 0.0069\n", + "max_loss = 40.0\n", + "\n", + "# Construct rows\n", + "rows = [{'strategy': 'none', 'parameter': 'none', 'dataset_size': base_size, 'total_loss': base_loss}]\n", + "\n", + "for strategy in strategies:\n", + " for i, param in enumerate(parameters):\n", + " # Gradually decrease size and increase loss\n", + " size = int(base_size - (base_size - min_size) * (i / (len(parameters) - 1)))\n", + " loss = round(base_loss + (max_loss - base_loss) * (i / (len(parameters) - 1)), 4)\n", + " rows.append({\n", + " 'strategy': strategy,\n", + " 'parameter': param,\n", + " 'dataset_size': size,\n", + " 'total_loss': loss\n", + " })\n", + "\n", + "# Create DataFrame\n", + "reg_results_df = pd.DataFrame(rows)\n", + "\n", + "# Display\n", + "print(reg_results_df)" + ] } ], "metadata": { From 226b2d91d725f881887c67b55f0bba7f67ec4ada Mon Sep 17 00:00:00 2001 From: eccuraa Date: Tue, 15 Jul 2025 15:06:17 -0400 Subject: [PATCH 47/58] synthetic dataset --- policyengine_us_data/datasets/cps/enhanced_cps.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 851ea464..3da4f571 100644 --- 
a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -42,8 +42,7 @@ def reweight( epochs=500, log_path="calibration_log.csv", penalty_approach=None, - penalty_weight=None, - + penalty_weight=None, ): target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") @@ -62,7 +61,11 @@ def reweight( ) # TO DO: replace this with a call to the python reweight.py package. - def loss(weights, penalty_approach=penalty_approach, penalty_weight=penalty_weight): + def loss( + weights, + penalty_approach=penalty_approach, + penalty_weight=penalty_weight, + ): # Check for Nans in either the weights or the loss matrix if torch.isnan(weights).any(): raise ValueError("Weights contain NaNs") @@ -86,7 +89,6 @@ def loss(weights, penalty_approach=penalty_approach, penalty_weight=penalty_weig epsilon = 1e-3 # Threshold for "near zero" - # Option 1: Sigmoid approximation if penalty_approach == "l0_sigmoid": smoothed_l0 = torch.sigmoid( @@ -103,14 +105,11 @@ def loss(weights, penalty_approach=penalty_approach, penalty_weight=penalty_weig if penalty_approach == "l0_exp": smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() - if penalty_approach == "l1": l1 = torch.mean(weights) return rel_error_normalized.mean() + penalty_weight * l1 - return ( - rel_error_normalized.mean() + penalty_weight * smoothed_l0 - ) + return rel_error_normalized.mean() + penalty_weight * smoothed_l0 else: return rel_error_normalized.mean() From 6a8160b93cc7b7cc4d794df0c2451ddf8465aa63 Mon Sep 17 00:00:00 2001 From: eccuraa Date: Tue, 15 Jul 2025 15:06:56 -0400 Subject: [PATCH 48/58] committing before changing file --- policyengine_us_data/utils/minimise.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index b3e0ed1a..fc86b14d 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -192,7 +192,7 @@ def get_loss_from_mask( included_weights, included_estimate_matrix, targets, - epochs=250, + epochs=10, ) # Put calibrated weights back into full array @@ -431,7 +431,7 @@ def minimise_dataset( initial_weights, loss_matrix_clean.values, # Convert to numpy array targets_clean, - epochs=250, # Reduced epochs for faster processing + epochs=10, # Reduced epochs for faster processing ) sim.set_input("household_weight", 2024, calibrated_weights) print("Final calibration completed successfully") From 842dfa6d3c34d35cccfbe62fd6141d459e217df7 Mon Sep 17 00:00:00 2001 From: eccuraa Date: Tue, 15 Jul 2025 15:10:59 -0400 Subject: [PATCH 49/58] Merge minimize.py from maria/ecps_minimization branch --- policyengine_us_data/utils/minimize.py | 444 +++++++++++++++++++++++++ 1 file changed, 444 insertions(+) create mode 100644 policyengine_us_data/utils/minimize.py diff --git a/policyengine_us_data/utils/minimize.py b/policyengine_us_data/utils/minimize.py new file mode 100644 index 00000000..ce2c6fdf --- /dev/null +++ b/policyengine_us_data/utils/minimize.py @@ -0,0 +1,444 @@ +from policyengine_us_data.utils.loss import build_loss_matrix +from policyengine_core.data import Dataset +from policyengine_us import Microsimulation +import numpy as np +import pandas as pd +import h5py +from policyengine_us_data.storage import STORAGE_FOLDER +from typing import Optional, Callable + +bad_targets = [ + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross 
income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", +] + + +def create_calibration_log_file(file_path, epoch=0): + dataset = Dataset.from_file(file_path) + sim = Microsimulation(dataset=dataset) + + loss_matrix, targets = build_loss_matrix(dataset, 2024) + + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_clean = targets[keep_idx] + + assert loss_matrix_clean.shape[1] == targets_clean.size + + estimates = ( + sim.calculate("household_weight", 2024).values @ loss_matrix_clean + ) + target_names = loss_matrix_clean.columns + + # Calculate and print some key metrics + errors = estimates - targets_clean + rel_errors = errors / targets_clean + + df = pd.DataFrame( + { + "target_name": target_names, + "estimate": estimates, + "target": targets_clean, + } + ) + df["epoch"] = epoch + df["error"] = df["estimate"] - df["target"] + df["rel_error"] = df["error"] / df["target"] + df["abs_error"] = df["error"].abs() + df["rel_abs_error"] = ( + df["abs_error"] / df["target"].abs() + if df["target"].abs().sum() > 0 + else np.nan + ) + df["loss"] = (df["rel_error"] ** 2).mean() + + df.to_csv( + str(file_path).replace(".h5", "_calibration_log.csv"), index=False + ) + + +def losses_for_candidates( + base_weights: np.ndarray, + idxs: np.ndarray, + est_mat: np.ndarray, + targets: np.ndarray, + norm: np.ndarray, + chunk_size: Optional[int] = 25_000, +) -> np.ndarray: + """ + Return the loss value *for each* candidate deletion in `idxs` + in one matrix multiplication. + + Parameters + ---------- + base_weights : (n,) original weight vector + idxs : (k,) candidate row indices to zero-out + est_mat : (n, m) estimate matrix + targets : (m,) calibration targets + norm : (m,) normalisation factors + chunk_size : max number of candidates to process at once + + Returns + ------- + losses : (k,) loss if row i were removed (and weights rescaled) + """ + W = base_weights + total = W.sum() + k = len(idxs) + losses = np.empty(k, dtype=float) + + # Work through the candidate list in blocks + for start in range(0, k, chunk_size): + stop = min(start + chunk_size, k) + part = idxs[start:stop] # (p,) where p ≤ chunk_size + p = len(part) + + # Build the delta matrix only for this chunk + delta = np.zeros((p, len(W))) + delta[np.arange(p), part] = -W[part] + + keep_total = total + delta.sum(axis=1) # (p,) + delta *= (total / keep_total)[:, None] + + # Matrix–matrix multiply → one matrix multiplication per chunk + ests = (W + delta) @ est_mat # (p, m) + rel_err = ((ests - targets) + 1) / (targets + 1) + losses[start:stop] = ((rel_err * norm) ** 2).mean(axis=1) + + return losses + + +def get_loss_from_mask( + weights, inclusion_mask, estimate_matrix, targets, normalisation_factor +): + """ + Calculate the loss based on the inclusion mask and the estimate matrix. 
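+
+    In outline: weights outside inclusion_mask are zeroed, the surviving
+    weights are rescaled so the total population weight is unchanged, the
+    retained households are re-calibrated with reweight(), and the loss is
+    the mean of the squared, normalisation-weighted (smoothed) relative
+    errors against targets.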
+ """ + # Step 1: Apply mask and rescale weights + masked_weights = weights.copy() + original_weight_total = masked_weights.sum() + if (~inclusion_mask).sum() > 0: + masked_weights[~inclusion_mask] = 0 + masked_weight_total = masked_weights.sum() + masked_weights[inclusion_mask] *= ( + original_weight_total / masked_weight_total + ) + + # Step 2: Re-calibrate the masked weights to hit targets + # Only calibrate the included households + included_weights = masked_weights[inclusion_mask] + included_estimate_matrix = estimate_matrix.iloc[ + inclusion_mask + ] # Keep as DataFrame + + # Call reweight function to calibrate the selected households + from policyengine_us_data.datasets.cps.enhanced_cps import reweight + + calibrated_weights_included = reweight( + included_weights, + included_estimate_matrix, + targets, + epochs=250, + ) + + # Put calibrated weights back into full array + calibrated_weights = np.zeros_like(masked_weights) + calibrated_weights[inclusion_mask] = calibrated_weights_included + + # Calculate estimates and loss from calibrated weights + estimates = calibrated_weights @ estimate_matrix + rel_error = ((estimates - targets) + 1) / (targets + 1) + loss = ((rel_error * normalisation_factor) ** 2).mean() + + return loss + + +def candidate_loss_contribution( + weights: np.ndarray, + estimate_matrix: np.ndarray, + targets: np.ndarray, + normalisation_factor: np.ndarray, + loss_rel_change_max: float, + count_iterations: int = 5, + view_fraction_per_iteration: float = 0.5, + fraction_remove_per_iteration: float = 0.05, +) -> np.ndarray: + """ + Minimization approach based on candidate loss contribution. + + This function iteratively removes households that contribute least to the loss, + maintaining the calibration quality within the specified tolerance. + + Parameters + ---------- + weights : (n,) household weights + estimate_matrix : (n, m) matrix mapping weights to estimates + targets : (m,) calibration targets + normalisation_factor : (m,) normalisation factors for different targets + loss_rel_change_max : maximum allowed relative change in loss + count_iterations : number of iterations to perform + view_fraction_per_iteration : fraction of households to evaluate each iteration + fraction_remove_per_iteration : fraction of households to remove each iteration + + Returns + ------- + inclusion_mask : (n,) boolean mask of households to keep + """ + from tqdm import tqdm + + full_mask = np.ones_like(weights, dtype=bool) + + for i in range(count_iterations): + inclusion_mask = full_mask.copy() + baseline_loss = get_loss_from_mask( + weights, + inclusion_mask, + estimate_matrix, + targets, + normalisation_factor, + ) + + # Sample only households that are currently included + indices = np.random.choice( + np.where(full_mask)[0], + size=int(full_mask.sum() * view_fraction_per_iteration), + replace=False, + ) + # 2. compute losses for the batch in one shot + candidate_losses = losses_for_candidates( + weights, indices, estimate_matrix, targets, normalisation_factor + ) + # 3. convert to relative change vs. 
baseline + household_loss_rel_changes = ( + candidate_losses - baseline_loss + ) / baseline_loss + + inclusion_mask = full_mask.copy() + household_loss_rel_changes = np.array(household_loss_rel_changes) + # Sort by the relative change in loss + sorted_indices = np.argsort(household_loss_rel_changes) + + # Remove the worst households + num_to_remove = int(len(weights) * fraction_remove_per_iteration) + worst_indices = indices[sorted_indices[:num_to_remove]] + inclusion_mask[worst_indices] = False + + # Calculate the new loss + new_loss = get_loss_from_mask( + weights, + inclusion_mask, + estimate_matrix, + targets, + normalisation_factor, + ) + rel_change = (new_loss - baseline_loss) / baseline_loss + + if rel_change > loss_rel_change_max: + print( + f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}, " + f"which is too high ({rel_change:.2%}). Stopping." + ) + break + + print( + f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}" + ) + print( + f"Removed {num_to_remove} households with worst relative loss changes." + ) + + # Update the full mask + full_mask &= inclusion_mask + + return full_mask + + +def random_sampling_minimization( + weights, + estimate_matrix, + targets, + normalisation_factor, + random=True, + target_fractions=[0.5, 0.6, 0.7, 0.8, 0.9], +): + """A simple random sampling approach""" + n = len(weights) + + household_weights_normalized = weights / weights.sum() + + final_mask = None + lowest_loss = float("inf") + for fraction in target_fractions: + target_size = int(n * fraction) + # Random sampling with multiple attempts + best_mask = None + best_loss = float("inf") + + for _ in range(3): # Try 3 random samples + mask = np.zeros(n, dtype=bool) + mask[ + np.random.choice( + n, + target_size, + p=household_weights_normalized if random else None, + replace=False, + ) + ] = True + + loss = get_loss_from_mask( + weights, mask, estimate_matrix, targets, normalisation_factor + ) + + if loss < best_loss: + best_loss = loss + best_mask = mask + + if lowest_loss > best_loss: + lowest_loss = best_loss + final_mask = best_mask + + return final_mask + + +def minimize_dataset( + dataset, + output_path: str, + minimization_function: Callable = candidate_loss_contribution, + loss_matrix: Optional[pd.DataFrame] = None, + targets: Optional[np.ndarray] = None, + **kwargs, +) -> None: + """ + Main function to minimize a dataset using a specified minimization approach. 
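+
+    In outline: the loss matrix is built, known-bad IRS targets are dropped,
+    minimization_function is run to obtain an inclusion mask, only those
+    households are kept and re-calibrated with reweight(), and the result is
+    written to HDF5 alongside a calibration log CSV.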
+ + Parameters + ---------- + dataset : path to the dataset file or Dataset object + output_path : path where the minimized dataset will be saved + loss_rel_change_max : maximum allowed relative change in loss + minimization_function : function that implements the minimization logic + **kwargs : additional arguments to pass to the minimization function + """ + # Handle both dataset class and file path + if hasattr(dataset, "file_path"): + dataset_path = str(dataset.file_path) + else: + dataset_path = str(dataset) + + create_calibration_log_file(dataset_path) + + dataset = Dataset.from_file(dataset_path) + if loss_matrix is None or targets is None: + loss_matrix, targets = build_loss_matrix(dataset, 2024) + + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_clean = targets[keep_idx] + assert loss_matrix_clean.shape[1] == targets_clean.size + else: + loss_matrix_clean = loss_matrix + targets_clean = targets + + sim = Microsimulation(dataset=dataset) + + weights = sim.calculate("household_weight", 2024).values + is_national = loss_matrix_clean.columns.str.startswith("nation/") + nation_normalisation_factor = is_national * (1 / is_national.sum()) + state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) + normalisation_factor = np.where( + is_national, nation_normalisation_factor, state_normalisation_factor + ) + + # Call the minimization function + inclusion_mask = minimization_function( + weights=weights, + estimate_matrix=loss_matrix_clean, + targets=targets_clean, + normalisation_factor=normalisation_factor, + **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. 
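+        # e.g. (per the functions defined above):
+        #   candidate_loss_contribution -> loss_rel_change_max,
+        #     count_iterations, view_fraction_per_iteration,
+        #     fraction_remove_per_iteration
+        #   random_sampling_minimization -> target_fractions, random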
+ ) + + # Extract household IDs for remaining households + household_ids = sim.calculate("household_id", 2024).values + remaining_households = household_ids[inclusion_mask] + + # Create a smaller dataset with only the remaining households + df = sim.to_input_dataframe() + smaller_df = df[df["household_id__2024"].isin(remaining_households)] + + weight_rel_change = ( + smaller_df["household_weight__2024"].sum() + / df["household_weight__2024"].sum() + ) + print(f"Weight relative change: {weight_rel_change:.2%}") + + # Create new simulation with smaller dataset + sim = Microsimulation(dataset=smaller_df) + + # Rescale weights to maintain total + initial_weights = ( + sim.calculate("household_weight", 2024).values / weight_rel_change + ) + + # Re-calibrate the final selected households to hit targets + print("Re-calibrating final selected households...") + + # Build loss matrix for the smaller dataset + smaller_loss_matrix, smaller_targets = build_loss_matrix(sim.dataset, 2024) + + # Apply same filtering as before + bad_mask = smaller_loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + smaller_loss_matrix_clean = smaller_loss_matrix.iloc[:, keep_idx] + smaller_targets_clean = smaller_targets[keep_idx] + + from policyengine_us_data.datasets.cps.enhanced_cps import reweight + + calibrated_weights = reweight( + initial_weights, + smaller_loss_matrix_clean, # Now matches the smaller dataset size + smaller_targets_clean, + epochs=250, # Reduced epochs for faster processing + ) + sim.set_input("household_weight", 2024, calibrated_weights) + print("Final calibration completed successfully") + # Prepare data for saving + data = {} + for variable in sim.input_variables: + data[variable] = {2024: sim.calculate(variable, 2024).values} + if data[variable][2024].dtype == "object": + data[variable][2024] = data[variable][2024].astype("S") + + # Save to HDF5 file + with h5py.File(output_path, "w") as f: + for variable, values in data.items(): + for year, value in values.items(): + f.create_dataset(f"{variable}/{year}", data=value) + + print(f"Saved minimised dataset to {output_path}") + create_calibration_log_file(output_path, epoch=250) + + +if __name__ == "__main__": + # Example usage + files = [ + STORAGE_FOLDER / "enhanced_cps_2024.h5", + ] + + for file in files: + output_path = file.with_name(file.stem + "_minimised.h5") + minimize_dataset( + file, + output_path, + ) From f815c7eb523991be87627f82f2514f1635292f2e Mon Sep 17 00:00:00 2001 From: eccuraa Date: Tue, 15 Jul 2025 15:12:46 -0400 Subject: [PATCH 50/58] renaming to american naming (maria started it haha) --- policyengine_us_data/utils/minimise.py | 466 ------------------------- 1 file changed, 466 deletions(-) delete mode 100644 policyengine_us_data/utils/minimise.py diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py deleted file mode 100644 index fc86b14d..00000000 --- a/policyengine_us_data/utils/minimise.py +++ /dev/null @@ -1,466 +0,0 @@ -from policyengine_us_data.utils.loss import build_loss_matrix -from policyengine_core.data import Dataset -from policyengine_us import Microsimulation -import numpy as np -import pandas as pd -import h5py -from policyengine_us_data.storage import STORAGE_FOLDER -from typing import Optional, Callable -from policyengine_us_data.datasets.cps.enhanced_cps import reweight - -bad_targets = [ - "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", - "nation/irs/adjusted gross 
income/total/AGI in 15k-20k/taxable/Head of Household", - "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", - "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", - "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", -] - - -def create_calibration_log_file(file_path, epoch=0): - print(f"=== CALIBRATION LOG DEBUG ===") - print(f"File path: {file_path}") - print(f"Epoch: {epoch}") - - dataset = Dataset.from_file(file_path) - sim = Microsimulation(dataset=dataset) - - # Debug: Print dataset info - household_weights = sim.calculate("household_weight", 2024) - print(f"Number of households: {len(household_weights)}") - print(f"Total weight: {household_weights.sum():.2f}") - print( - f"Weight range: {household_weights.min():.2f} to {household_weights.max():.2f}" - ) - - loss_matrix, targets = build_loss_matrix(dataset, 2024) - print(f"Loss matrix shape: {loss_matrix.shape}") - print(f"Number of targets: {len(targets)}") - - bad_mask = loss_matrix.columns.isin(bad_targets) - keep_mask_bool = ~bad_mask - keep_idx = np.where(keep_mask_bool)[0] - loss_matrix_clean = loss_matrix.iloc[:, keep_idx] - targets_clean = targets[keep_idx] - - print(f"After filtering bad targets:") - print(f"Loss matrix clean shape: {loss_matrix_clean.shape}") - print(f"Number of clean targets: {len(targets_clean)}") - - assert loss_matrix_clean.shape[1] == targets_clean.size - - estimates = ( - sim.calculate("household_weight", 2024).values @ loss_matrix_clean - ) - target_names = loss_matrix_clean.columns - - # Debug: Print estimate statistics - print(f"Estimates shape: {estimates.shape}") - print(f"Estimates sum: {estimates.sum():.2f}") - print(f"First 3 estimates: {estimates[:3]}") - print(f"First 3 targets: {targets_clean[:3]}") - - # Calculate and print some key metrics - errors = estimates - targets_clean - rel_errors = errors / targets_clean - print(f"Mean absolute error: {np.abs(errors).mean():.2f}") - print(f"Mean relative error: {np.abs(rel_errors).mean():.4f}") - print(f"=== END DEBUG ===\n") - - df = pd.DataFrame( - { - "target_name": target_names, - "estimate": estimates, - "target": targets_clean, - } - ) - df["epoch"] = epoch - df["error"] = df["estimate"] - df["target"] - df["rel_error"] = df["error"] / df["target"] - df["abs_error"] = df["error"].abs() - df["rel_abs_error"] = ( - df["abs_error"] / df["target"].abs() - if df["target"].abs().sum() > 0 - else np.nan - ) - df["loss"] = (df["rel_error"] ** 2).mean() - - df.to_csv( - str(file_path).replace(".h5", "_calibration_log.csv"), index=False - ) - - -def losses_for_candidates( - base_weights: np.ndarray, - idxs: np.ndarray, - est_mat: np.ndarray, - targets: np.ndarray, - norm: np.ndarray, - chunk_size: Optional[int] = 25_000, -) -> np.ndarray: - """ - Return the loss value *for each* candidate deletion in `idxs` - in one matrix multiplication. 
- - Parameters - ---------- - base_weights : (n,) original weight vector - idxs : (k,) candidate row indices to zero-out - est_mat : (n, m) estimate matrix - targets : (m,) calibration targets - norm : (m,) normalisation factors - chunk_size : max number of candidates to process at once - - Returns - ------- - losses : (k,) loss if row i were removed (and weights rescaled) - """ - W = base_weights - total = W.sum() - k = len(idxs) - losses = np.empty(k, dtype=float) - - # Work through the candidate list in blocks - for start in range(0, k, chunk_size): - stop = min(start + chunk_size, k) - part = idxs[start:stop] # (p,) where p ≤ chunk_size - p = len(part) - - # Build the delta matrix only for this chunk - delta = np.zeros((p, len(W))) - delta[np.arange(p), part] = -W[part] - - keep_total = total + delta.sum(axis=1) # (p,) - delta *= (total / keep_total)[:, None] - - # Matrix–matrix multiply → one matrix multiplication per chunk - ests = (W + delta) @ est_mat # (p, m) - rel_err = ((ests - targets) + 1) / (targets + 1) - losses[start:stop] = ((rel_err * norm) ** 2).mean(axis=1) - - return losses - - -def minimise_dataset( - dataset, output_path: str, loss_rel_change_max: float -) -> None: - dataset = str(dataset) - create_calibration_log_file(dataset) - - dataset = Dataset.from_file(dataset) - loss_matrix = build_loss_matrix(dataset, 2024) - - sim = Microsimulation(dataset=dataset) - - weights = sim.calculate("household_weight", 2024).values - estimate_matrix, targets = loss_matrix - is_national = estimate_matrix.columns.str.startswith("nation/") - nation_normalisation_factor = is_national * (1 / is_national.sum()) - state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) - normalisation_factor = np.where( - is_national, nation_normalisation_factor, state_normalisation_factor - ) - weights @ estimate_matrix - - -def get_loss_from_mask( - weights, inclusion_mask, estimate_matrix, targets, normalisation_factor -): - """ - Calculate the loss based on the inclusion mask and the estimate matrix. 
- """ - # Step 1: Apply mask and rescale weights - masked_weights = weights.copy() - original_weight_total = masked_weights.sum() - if (~inclusion_mask).sum() > 0: - masked_weights[~inclusion_mask] = 0 - masked_weight_total = masked_weights.sum() - masked_weights[inclusion_mask] *= ( - original_weight_total / masked_weight_total - ) - - # Step 2: Re-calibrate the masked weights to hit targets - # Only calibrate the included households - included_weights = masked_weights[inclusion_mask] - included_estimate_matrix = estimate_matrix[inclusion_mask] - - # Call reweight function to calibrate the selected households - calibrated_weights_included = reweight( - included_weights, - included_estimate_matrix, - targets, - epochs=10, - ) - - # Put calibrated weights back into full array - calibrated_weights = np.zeros_like(masked_weights) - calibrated_weights[inclusion_mask] = calibrated_weights_included - - # Calculate estimates and loss from calibrated weights - estimates = calibrated_weights @ estimate_matrix - rel_error = ((estimates - targets) + 1) / (targets + 1) - loss = ((rel_error * normalisation_factor) ** 2).mean() - - return loss - - -def candidate_loss_contribution( - weights: np.ndarray, - estimate_matrix: np.ndarray, - targets: np.ndarray, - normalisation_factor: np.ndarray, - loss_rel_change_max: float, - count_iterations: int = 5, - view_fraction_per_iteration: float = 0.3, - fraction_remove_per_iteration: float = 0.1, -) -> np.ndarray: - """ - Minimization approach based on candidate loss contribution. - - This function iteratively removes households that contribute least to the loss, - maintaining the calibration quality within the specified tolerance. - - Parameters - ---------- - weights : (n,) household weights - estimate_matrix : (n, m) matrix mapping weights to estimates - targets : (m,) calibration targets - normalisation_factor : (m,) normalisation factors for different targets - loss_rel_change_max : maximum allowed relative change in loss - count_iterations : number of iterations to perform - view_fraction_per_iteration : fraction of households to evaluate each iteration - fraction_remove_per_iteration : fraction of households to remove each iteration - - Returns - ------- - inclusion_mask : (n,) boolean mask of households to keep - """ - from tqdm import tqdm - - full_mask = np.ones_like(weights, dtype=bool) - - for i in range(count_iterations): - inclusion_mask = full_mask.copy() - baseline_loss = get_loss_from_mask( - weights, - inclusion_mask, - estimate_matrix, - targets, - normalisation_factor, - ) - - # Sample only households that are currently included - indices = np.random.choice( - np.where(full_mask)[0], - size=int(full_mask.sum() * view_fraction_per_iteration), - replace=False, - ) - # 2. compute losses for the batch in one shot - candidate_losses = losses_for_candidates( - weights, indices, estimate_matrix, targets, normalisation_factor - ) - # 3. convert to relative change vs. 
baseline - household_loss_rel_changes = ( - candidate_losses - baseline_loss - ) / baseline_loss - - inclusion_mask = full_mask.copy() - household_loss_rel_changes = np.array(household_loss_rel_changes) - # Sort by the relative change in loss - sorted_indices = np.argsort(household_loss_rel_changes) - - # Remove the worst households - num_to_remove = int(len(weights) * fraction_remove_per_iteration) - worst_indices = indices[sorted_indices[:num_to_remove]] - inclusion_mask[worst_indices] = False - - # Calculate the new loss - new_loss = get_loss_from_mask( - weights, - inclusion_mask, - estimate_matrix, - targets, - normalisation_factor, - ) - rel_change = (new_loss - baseline_loss) / baseline_loss - - if rel_change > loss_rel_change_max: - print( - f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}, " - f"which is too high ({rel_change:.2%}). Stopping." - ) - break - - print( - f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}" - ) - print( - f"Removed {num_to_remove} households with worst relative loss changes." - ) - - # Update the full mask - full_mask &= inclusion_mask - - return full_mask - - -def random_sampling_minimization( - weights, - estimate_matrix, - targets, - normalisation_factor, - random=True, - target_fractions=[0.5, 0.6, 0.7, 0.8, 0.9], -): - """A simple random sampling approach""" - n = len(weights) - - household_weights_normalized = weights / weights.sum() - - final_mask = None - lowest_loss = float("inf") - for fraction in target_fractions: - target_size = int(n * fraction) - # Random sampling with multiple attempts - best_mask = None - best_loss = float("inf") - - for _ in range(3): # Try 3 random samples - mask = np.zeros(n, dtype=bool) - mask[ - np.random.choice( - n, - target_size, - p=household_weights_normalized if random else None, - replace=False, - ) - ] = True - - loss = get_loss_from_mask( - weights, mask, estimate_matrix, targets, normalisation_factor - ) - - if loss < best_loss: - best_loss = loss - best_mask = mask - - if lowest_loss > best_loss: - lowest_loss = best_loss - final_mask = best_mask - - return final_mask - - -def minimise_dataset( - dataset, - output_path: str, - minimization_function: Callable = candidate_loss_contribution, - **kwargs, -) -> None: - """ - Main function to minimize a dataset using a specified minimization approach. 
- - Parameters - ---------- - dataset : path to the dataset file or Dataset object - output_path : path where the minimized dataset will be saved - loss_rel_change_max : maximum allowed relative change in loss - minimization_function : function that implements the minimization logic - **kwargs : additional arguments to pass to the minimization function - """ - dataset = str(dataset) - create_calibration_log_file(dataset) - - dataset = Dataset.from_file(dataset) - loss_matrix, targets = build_loss_matrix(dataset, 2024) - - bad_mask = loss_matrix.columns.isin(bad_targets) - keep_mask_bool = ~bad_mask - keep_idx = np.where(keep_mask_bool)[0] - loss_matrix_clean = loss_matrix.iloc[:, keep_idx] - targets_clean = targets[keep_idx] - assert loss_matrix_clean.shape[1] == targets_clean.size - - sim = Microsimulation(dataset=dataset) - - weights = sim.calculate("household_weight", 2024).values - is_national = loss_matrix_clean.columns.str.startswith("nation/") - nation_normalisation_factor = is_national * (1 / is_national.sum()) - state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) - normalisation_factor = np.where( - is_national, nation_normalisation_factor, state_normalisation_factor - ) - - # Call the minimization function - inclusion_mask = minimization_function( - weights=weights, - estimate_matrix=loss_matrix_clean, - targets=targets_clean, - normalisation_factor=normalisation_factor, - **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. - ) - - # Extract household IDs for remaining households - household_ids = sim.calculate("household_id", 2024).values - remaining_households = household_ids[inclusion_mask] - - # Create a smaller dataset with only the remaining households - df = sim.to_input_dataframe() - smaller_df = df[df["household_id__2024"].isin(remaining_households)] - - weight_rel_change = ( - smaller_df["household_weight__2024"].sum() - / df["household_weight__2024"].sum() - ) - print(f"Weight relative change: {weight_rel_change:.2%}") - - # Create new simulation with smaller dataset - sim = Microsimulation(dataset=smaller_df) - - # Rescale weights to maintain total - initial_weights = ( - sim.calculate("household_weight", 2024).values / weight_rel_change - ) - - # Re-calibrate the final selected households to hit targets - print("Re-calibrating final selected households...") - calibrated_weights = reweight( - initial_weights, - loss_matrix_clean.values, # Convert to numpy array - targets_clean, - epochs=10, # Reduced epochs for faster processing - ) - sim.set_input("household_weight", 2024, calibrated_weights) - print("Final calibration completed successfully") - # Prepare data for saving - data = {} - for variable in sim.input_variables: - data[variable] = {2024: sim.calculate(variable, 2024).values} - if data[variable][2024].dtype == "object": - data[variable][2024] = data[variable][2024].astype("S") - - # Save to HDF5 file - with h5py.File(output_path, "w") as f: - for variable, values in data.items(): - for year, value in values.items(): - f.create_dataset(f"{variable}/{year}", data=value) - - print(f"Saved minimised dataset to {output_path}") - create_calibration_log_file(output_path, epoch=500) - - -if __name__ == "__main__": - # Example usage - files = [ - STORAGE_FOLDER / "enhanced_cps_2024.h5", - ] - - for file in files: - output_path = file.with_name(file.stem + "_minimised.h5") - minimise_dataset( - file, - output_path, - ) From 096fb0f2c98b5387da0fc7827a42d75d288fbae9 Mon Sep 17 
00:00:00 2001 From: eccuraa Date: Tue, 15 Jul 2025 15:15:49 -0400 Subject: [PATCH 51/58] more american spelling for debugging --- test_minimization_approach.ipynb | 73 +++----------------------------- 1 file changed, 7 insertions(+), 66 deletions(-) diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index ea561155..ab96f82f 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,12 +2,12 @@ "cells": [ { "cell_type": "code", - "execution_count": 66, + "execution_count": 2, "id": "d6dc9cca", "metadata": {}, "outputs": [], "source": [ - "from policyengine_us_data.utils.minimise import minimise_dataset, random_sampling_minimization, candidate_loss_contribution\n", + "from policyengine_us_data.utils.minimize import minimize_dataset, random_sampling_minimization, candidate_loss_contribution\n", "from policyengine_us_data.storage import STORAGE_FOLDER\n", "from policyengine_us import Microsimulation\n", "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", @@ -211,74 +211,15 @@ "id": "69ff392d", "metadata": {}, "source": [ - "# Minimise.py approaches" + "# Minimize.py approaches" ] }, { "cell_type": "code", - "execution_count": 67, + "execution_count": null, "id": "aeab67b3", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "=== CALIBRATION LOG DEBUG ===\n", - "File path: /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/enhanced_cps_2024.h5\n", - "Epoch: 0\n", - "Number of households: 41310\n", - "Total weight: 12764381616743.21\n", - "Weight range: 0.54 to 1303728.75\n", - "Loss matrix shape: (41310, 2813)\n", - "Number of targets: 2813\n", - "After filtering bad targets:\n", - "Loss matrix clean shape: (41310, 2805)\n", - "Number of clean targets: 2805\n", - "Estimates shape: (2805,)\n", - "Estimates sum: 324584770671300.88\n", - "First 3 estimates: nation/irs/adjusted gross income/total/AGI in -inf-inf/taxable/All 1.498784e+13\n", - "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/All 1.609638e+10\n", - "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/All 6.707770e+10\n", - "dtype: float64\n", - "First 3 targets: [1.62972204e+13 1.68634879e+10 6.76819729e+10]\n", - "Mean absolute error: 17235490830.73\n", - "Mean relative error: 0.0997\n", - "=== END DEBUG ===\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250/250 [03:38<00:00, 1.14it/s, loss=3.34e-5, loss_rel_change=-0.357]\n", - "100%|██████████| 250/250 [02:39<00:00, 1.57it/s, loss=3.52e-5, loss_rel_change=-0.334]\n", - "100%|██████████| 250/250 [01:32<00:00, 2.70it/s, loss=3.39e-5, loss_rel_change=-0.34] \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Weight relative change: 99.95%\n", - "Re-calibrating final selected households...\n" - ] - }, - { - "ename": "AttributeError", - "evalue": "'numpy.ndarray' object has no attribute 'columns'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[67], line 31\u001b[0m\n\u001b[1;32m 29\u001b[0m output_path \u001b[38;5;241m=\u001b[39m STORAGE_FOLDER \u001b[38;5;241m/\u001b[39m approach \u001b[38;5;241m/\u001b[39m 
\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvalue\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_enhanced_cps_2024_minimised.h5\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 30\u001b[0m output_path\u001b[38;5;241m.\u001b[39mparent\u001b[38;5;241m.\u001b[39mmkdir(parents\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m---> 31\u001b[0m \u001b[43mminimise_dataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 32\u001b[0m \u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 33\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 34\u001b[0m \u001b[43m \u001b[49m\u001b[43mminimization_function\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mminimization_function\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 35\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget_fractions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 36\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m params \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mloss_rel_change_max\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 38\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m files:\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/utils/minimise.py:430\u001b[0m, in \u001b[0;36mminimise_dataset\u001b[0;34m(dataset, output_path, minimization_function, **kwargs)\u001b[0m\n\u001b[1;32m 428\u001b[0m \u001b[38;5;66;03m# Re-calibrate the final selected households to hit targets\u001b[39;00m\n\u001b[1;32m 429\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRe-calibrating final selected households...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 430\u001b[0m calibrated_weights \u001b[38;5;241m=\u001b[39m \u001b[43mreweight\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 431\u001b[0m \u001b[43m \u001b[49m\u001b[43minitial_weights\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 432\u001b[0m \u001b[43m \u001b[49m\u001b[43mloss_matrix_clean\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Convert to numpy array\u001b[39;49;00m\n\u001b[1;32m 433\u001b[0m \u001b[43m \u001b[49m\u001b[43mtargets_clean\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 434\u001b[0m \u001b[43m \u001b[49m\u001b[43mepochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Reduced epochs for faster processing\u001b[39;49;00m\n\u001b[1;32m 435\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 436\u001b[0m sim\u001b[38;5;241m.\u001b[39mset_input(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhousehold_weight\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;241m2024\u001b[39m, calibrated_weights)\n\u001b[1;32m 437\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFinal calibration completed successfully\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/datasets/cps/enhanced_cps.py:47\u001b[0m, in \u001b[0;36mreweight\u001b[0;34m(original_weights, loss_matrix, 
targets_array, dropout_rate, epochs, log_path, penalty_approach, penalty_weight)\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mreweight\u001b[39m(\n\u001b[1;32m 38\u001b[0m original_weights,\n\u001b[1;32m 39\u001b[0m loss_matrix,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 45\u001b[0m penalty_weight\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 46\u001b[0m ):\n\u001b[0;32m---> 47\u001b[0m target_names \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39marray(\u001b[43mloss_matrix\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m)\n\u001b[1;32m 48\u001b[0m is_national \u001b[38;5;241m=\u001b[39m loss_matrix\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mstr\u001b[38;5;241m.\u001b[39mstartswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnation/\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 49\u001b[0m loss_matrix \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mtensor(loss_matrix\u001b[38;5;241m.\u001b[39mvalues, dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat32)\n", - "\u001b[0;31mAttributeError\u001b[0m: 'numpy.ndarray' object has no attribute 'columns'" - ] - } - ], + "outputs": [], "source": [ "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", "\n", @@ -310,7 +251,7 @@ " for file in files:\n", " output_path = STORAGE_FOLDER / approach / f\"{value}_enhanced_cps_2024_minimised.h5\"\n", " output_path.parent.mkdir(parents=True, exist_ok=True)\n", - " minimise_dataset(\n", + " minimize_dataset(\n", " file,\n", " output_path,\n", " minimization_function=minimization_function, \n", @@ -320,7 +261,7 @@ " for file in files:\n", " output_path = STORAGE_FOLDER / approach / f\"{value}_enhanced_cps_2024_minimised.h5\"\n", " output_path.parent.mkdir(parents=True, exist_ok=True)\n", - " minimise_dataset(\n", + " minimize_dataset(\n", " file,\n", " output_path,\n", " minimization_function=minimization_function, \n", From 41980ac5947497f06942de6c9834bcaca26e7397 Mon Sep 17 00:00:00 2001 From: eccuraa Date: Tue, 15 Jul 2025 15:42:17 -0400 Subject: [PATCH 52/58] initial visualization with synthetic data --- test_minimization_approach.ipynb | 1373 +++++++++++++++++++++++++++++- 1 file changed, 1363 insertions(+), 10 deletions(-) diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index ab96f82f..9952ae4e 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,10 +2,19 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "d6dc9cca", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], "source": [ "from policyengine_us_data.utils.minimize import minimize_dataset, random_sampling_minimization, candidate_loss_contribution\n", "from policyengine_us_data.storage import STORAGE_FOLDER\n", @@ -16,6 +25,7 @@ "import os\n", "import h5py\n", "import pandas as pd\n", + "import plotly.express as px\n", "\n", "\n", "bad_targets = [\n", @@ -216,10 +226,68 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "aeab67b3", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[6], line 31\u001b[0m\n\u001b[1;32m 29\u001b[0m output_path \u001b[38;5;241m=\u001b[39m STORAGE_FOLDER \u001b[38;5;241m/\u001b[39m approach \u001b[38;5;241m/\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvalue\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_enhanced_cps_2024_minimised.h5\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 30\u001b[0m output_path\u001b[38;5;241m.\u001b[39mparent\u001b[38;5;241m.\u001b[39mmkdir(parents\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m---> 31\u001b[0m \u001b[43mminimize_dataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 32\u001b[0m \u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 33\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 34\u001b[0m \u001b[43m \u001b[49m\u001b[43mminimization_function\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mminimization_function\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 35\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget_fractions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 36\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m params \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mloss_rel_change_max\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 38\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m files:\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/utils/minimize.py:336\u001b[0m, in \u001b[0;36mminimize_dataset\u001b[0;34m(dataset, output_path, minimization_function, loss_matrix, targets, **kwargs)\u001b[0m\n\u001b[1;32m 333\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 334\u001b[0m dataset_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(dataset)\n\u001b[0;32m--> 336\u001b[0m \u001b[43mcreate_calibration_log_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 338\u001b[0m dataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_file(dataset_path)\n\u001b[1;32m 339\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m loss_matrix \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m targets \u001b[38;5;129;01mis\u001b[39;00m 
\u001b[38;5;28;01mNone\u001b[39;00m:\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/utils/minimize.py:26\u001b[0m, in \u001b[0;36mcreate_calibration_log_file\u001b[0;34m(file_path, epoch)\u001b[0m\n\u001b[1;32m 23\u001b[0m dataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_file(file_path)\n\u001b[1;32m 24\u001b[0m sim \u001b[38;5;241m=\u001b[39m Microsimulation(dataset\u001b[38;5;241m=\u001b[39mdataset)\n\u001b[0;32m---> 26\u001b[0m loss_matrix, targets \u001b[38;5;241m=\u001b[39m \u001b[43mbuild_loss_matrix\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m2024\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 28\u001b[0m bad_mask \u001b[38;5;241m=\u001b[39m loss_matrix\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39misin(bad_targets)\n\u001b[1;32m 29\u001b[0m keep_mask_bool \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m~\u001b[39mbad_mask\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/utils/loss.py:419\u001b[0m, in \u001b[0;36mbuild_loss_matrix\u001b[0;34m(dataset, time_period)\u001b[0m\n\u001b[1;32m 415\u001b[0m targets_array\u001b[38;5;241m.\u001b[39mappend(NET_WORTH_2024)\n\u001b[1;32m 417\u001b[0m \u001b[38;5;66;03m# SALT tax expenditure targeting\u001b[39;00m\n\u001b[0;32m--> 419\u001b[0m \u001b[43m_add_tax_expenditure_targets\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 420\u001b[0m \u001b[43m \u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtime_period\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msim\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mloss_matrix\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtargets_array\u001b[49m\n\u001b[1;32m 421\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 423\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28many\u001b[39m(loss_matrix\u001b[38;5;241m.\u001b[39misna()\u001b[38;5;241m.\u001b[39msum() \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m):\n\u001b[1;32m 424\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSome targets are missing from the loss matrix\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/utils/loss.py:637\u001b[0m, in \u001b[0;36m_add_tax_expenditure_targets\u001b[0;34m(dataset, time_period, baseline_simulation, loss_matrix, targets_array)\u001b[0m\n\u001b[1;32m 634\u001b[0m simulation\u001b[38;5;241m.\u001b[39mdefault_calculation_period \u001b[38;5;241m=\u001b[39m time_period\n\u001b[1;32m 636\u001b[0m \u001b[38;5;66;03m# Calculate the baseline and reform income tax values.\u001b[39;00m\n\u001b[0;32m--> 637\u001b[0m income_tax_r \u001b[38;5;241m=\u001b[39m \u001b[43msimulation\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 638\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mincome_tax\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mhousehold\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\n\u001b[1;32m 639\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mvalues\n\u001b[1;32m 641\u001b[0m 
\u001b[38;5;66;03m# Compute the tax expenditure (TE) values.\u001b[39;00m\n\u001b[1;32m 642\u001b[0m te_values \u001b[38;5;241m=\u001b[39m income_tax_r \u001b[38;5;241m-\u001b[39m income_tax_b\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/microsimulation.py:54\u001b[0m, in \u001b[0;36mMicrosimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, use_weights, decode_enums)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m period \u001b[38;5;241m=\u001b[39m get_period(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period)\n\u001b[0;32m---> 54\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_enums\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m use_weights:\n\u001b[1;32m 56\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m values\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:485\u001b[0m, in \u001b[0;36mSimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, decode_enums)\u001b[0m\n\u001b[1;32m 482\u001b[0m np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mseed(\u001b[38;5;28mhash\u001b[39m(variable_name \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(period)) \u001b[38;5;241m%\u001b[39m \u001b[38;5;241m1000000\u001b[39m)\n\u001b[1;32m 484\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 485\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_calculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 486\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, EnumArray) \u001b[38;5;129;01mand\u001b[39;00m decode_enums:\n\u001b[1;32m 487\u001b[0m result \u001b[38;5;241m=\u001b[39m result\u001b[38;5;241m.\u001b[39mdecode_to_str()\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:715\u001b[0m, in \u001b[0;36mSimulation._calculate\u001b[0;34m(self, variable_name, period)\u001b[0m\n\u001b[1;32m 713\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 714\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_for_cycle(variable\u001b[38;5;241m.\u001b[39mname, period)\n\u001b[0;32m--> 715\u001b[0m array \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_formula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 717\u001b[0m \u001b[38;5;66;03m# If no result, use the default value and cache it\u001b[39;00m\n\u001b[1;32m 718\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m array \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 719\u001b[0m \u001b[38;5;66;03m# Check if the variable has a previously defined value\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:1005\u001b[0m, in \u001b[0;36mSimulation._run_formula\u001b[0;34m(self, variable, population, period)\u001b[0m\n\u001b[1;32m 1003\u001b[0m array \u001b[38;5;241m=\u001b[39m formula(population, period)\n\u001b[1;32m 1004\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1005\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[43mformula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparameters_at\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1007\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m array\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/variables/gov/irs/tax/federal_income/income_tax.py:18\u001b[0m, in \u001b[0;36mincome_tax.formula\u001b[0;34m(person, period, parameters)\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 18\u001b[0m added_components \u001b[38;5;241m=\u001b[39m \u001b[43madd\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 19\u001b[0m \u001b[43m \u001b[49m\u001b[43mperson\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mincome_tax_before_refundable_credits\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[1;32m 20\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 21\u001b[0m subtracted_components \u001b[38;5;241m=\u001b[39m add(\n\u001b[1;32m 22\u001b[0m person, period, [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mincome_tax_refundable_credits\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 23\u001b[0m )\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m added_components \u001b[38;5;241m-\u001b[39m subtracted_components\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/commons/formulas.py:227\u001b[0m, in \u001b[0;36madd\u001b[0;34m(entity, period, variables, options)\u001b[0m\n\u001b[1;32m 207\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21madd\u001b[39m(\n\u001b[1;32m 208\u001b[0m entity: Population,\n\u001b[1;32m 209\u001b[0m period: Period,\n\u001b[1;32m 210\u001b[0m variables: List[\u001b[38;5;28mstr\u001b[39m],\n\u001b[1;32m 211\u001b[0m options: List[\u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 212\u001b[0m ):\n\u001b[1;32m 213\u001b[0m \u001b[38;5;250m 
\u001b[39m\u001b[38;5;124;03m\"\"\"Sums a list of variables.\u001b[39;00m\n\u001b[1;32m 214\u001b[0m \n\u001b[1;32m 215\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 225\u001b[0m \u001b[38;5;124;03m ArrayLike: The result of the operation.\u001b[39;00m\n\u001b[1;32m 226\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 227\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfor_each_variable\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 228\u001b[0m \u001b[43m \u001b[49m\u001b[43mentity\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvariables\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43magg_func\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43madd\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moptions\u001b[49m\n\u001b[1;32m 229\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/commons/formulas.py:183\u001b[0m, in \u001b[0;36mfor_each_variable\u001b[0;34m(entity, period, variables, agg_func, group_agg_func, options)\u001b[0m\n\u001b[1;32m 181\u001b[0m variable_entity \u001b[38;5;241m=\u001b[39m entity\u001b[38;5;241m.\u001b[39mentity\u001b[38;5;241m.\u001b[39mget_variable(variable)\u001b[38;5;241m.\u001b[39mentity\n\u001b[1;32m 182\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m variable_entity\u001b[38;5;241m.\u001b[39mkey \u001b[38;5;241m==\u001b[39m entity\u001b[38;5;241m.\u001b[39mentity\u001b[38;5;241m.\u001b[39mkey:\n\u001b[0;32m--> 183\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[43mentity\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 184\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m variable_entity\u001b[38;5;241m.\u001b[39mis_person:\n\u001b[1;32m 185\u001b[0m values \u001b[38;5;241m=\u001b[39m group_agg_func(\n\u001b[1;32m 186\u001b[0m entity\u001b[38;5;241m.\u001b[39mmembers(variable, period, options\u001b[38;5;241m=\u001b[39moptions)\n\u001b[1;32m 187\u001b[0m )\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/populations/group_population.py:38\u001b[0m, in \u001b[0;36mGroupPopulation.__call__\u001b[0;34m(self, variable_name, period, options)\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msum(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmembers(variable_name, period, options))\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 38\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__call__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[43m)\u001b[49m\n", + "File 
\u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/populations/population.py:137\u001b[0m, in \u001b[0;36mPopulation.__call__\u001b[0;34m(self, variable_name, period, options)\u001b[0m\n\u001b[1;32m 133\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msimulation\u001b[38;5;241m.\u001b[39mcalculate_divide(\n\u001b[1;32m 134\u001b[0m variable_name, period, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mcalculate_kwargs\n\u001b[1;32m 135\u001b[0m )\n\u001b[1;32m 136\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 137\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msimulation\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 138\u001b[0m \u001b[43m \u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcalculate_kwargs\u001b[49m\n\u001b[1;32m 139\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/microsimulation.py:54\u001b[0m, in \u001b[0;36mMicrosimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, use_weights, decode_enums)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m period \u001b[38;5;241m=\u001b[39m get_period(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period)\n\u001b[0;32m---> 54\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_enums\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m use_weights:\n\u001b[1;32m 56\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m values\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:485\u001b[0m, in \u001b[0;36mSimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, decode_enums)\u001b[0m\n\u001b[1;32m 482\u001b[0m np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mseed(\u001b[38;5;28mhash\u001b[39m(variable_name \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(period)) \u001b[38;5;241m%\u001b[39m \u001b[38;5;241m1000000\u001b[39m)\n\u001b[1;32m 484\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 485\u001b[0m result \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_calculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 486\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, EnumArray) \u001b[38;5;129;01mand\u001b[39;00m decode_enums:\n\u001b[1;32m 487\u001b[0m result \u001b[38;5;241m=\u001b[39m result\u001b[38;5;241m.\u001b[39mdecode_to_str()\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:715\u001b[0m, in \u001b[0;36mSimulation._calculate\u001b[0;34m(self, variable_name, period)\u001b[0m\n\u001b[1;32m 713\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 714\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_for_cycle(variable\u001b[38;5;241m.\u001b[39mname, period)\n\u001b[0;32m--> 715\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_formula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 717\u001b[0m \u001b[38;5;66;03m# If no result, use the default value and cache it\u001b[39;00m\n\u001b[1;32m 718\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m array \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 719\u001b[0m \u001b[38;5;66;03m# Check if the variable has a previously defined value\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:1005\u001b[0m, in \u001b[0;36mSimulation._run_formula\u001b[0;34m(self, variable, population, period)\u001b[0m\n\u001b[1;32m 1003\u001b[0m array \u001b[38;5;241m=\u001b[39m formula(population, period)\n\u001b[1;32m 1004\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1005\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[43mformula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparameters_at\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1007\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m array\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/variables/gov/irs/tax/federal_income/income_tax_before_refundable_credits.py:18\u001b[0m, in \u001b[0;36mincome_tax_before_refundable_credits.formula\u001b[0;34m(tax_unit, period, parameters)\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 18\u001b[0m added_components \u001b[38;5;241m=\u001b[39m \u001b[43madd\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 19\u001b[0m \u001b[43m \u001b[49m\u001b[43mtax_unit\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 20\u001b[0m \u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 21\u001b[0m \u001b[43m \u001b[49m\u001b[43m[\u001b[49m\n\u001b[1;32m 22\u001b[0m \u001b[43m 
\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mincome_tax_before_credits\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 23\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mnet_investment_income_tax\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 24\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrecapture_of_investment_credit\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 25\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43munreported_payroll_tax\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 26\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mqualified_retirement_penalty\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 27\u001b[0m \u001b[43m \u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 28\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 29\u001b[0m subtracted_components \u001b[38;5;241m=\u001b[39m add(\n\u001b[1;32m 30\u001b[0m tax_unit, period, [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mincome_tax_capped_non_refundable_credits\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 31\u001b[0m )\n\u001b[1;32m 32\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m added_components \u001b[38;5;241m-\u001b[39m subtracted_components\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/commons/formulas.py:227\u001b[0m, in \u001b[0;36madd\u001b[0;34m(entity, period, variables, options)\u001b[0m\n\u001b[1;32m 207\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21madd\u001b[39m(\n\u001b[1;32m 208\u001b[0m entity: Population,\n\u001b[1;32m 209\u001b[0m period: Period,\n\u001b[1;32m 210\u001b[0m variables: List[\u001b[38;5;28mstr\u001b[39m],\n\u001b[1;32m 211\u001b[0m options: List[\u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 212\u001b[0m ):\n\u001b[1;32m 213\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Sums a list of variables.\u001b[39;00m\n\u001b[1;32m 214\u001b[0m \n\u001b[1;32m 215\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 225\u001b[0m \u001b[38;5;124;03m ArrayLike: The result of the operation.\u001b[39;00m\n\u001b[1;32m 226\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 227\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfor_each_variable\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 228\u001b[0m \u001b[43m \u001b[49m\u001b[43mentity\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvariables\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43magg_func\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43madd\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moptions\u001b[49m\n\u001b[1;32m 229\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/commons/formulas.py:183\u001b[0m, in \u001b[0;36mfor_each_variable\u001b[0;34m(entity, period, 
variables, agg_func, group_agg_func, options)\u001b[0m\n\u001b[1;32m 181\u001b[0m variable_entity \u001b[38;5;241m=\u001b[39m entity\u001b[38;5;241m.\u001b[39mentity\u001b[38;5;241m.\u001b[39mget_variable(variable)\u001b[38;5;241m.\u001b[39mentity\n\u001b[1;32m 182\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m variable_entity\u001b[38;5;241m.\u001b[39mkey \u001b[38;5;241m==\u001b[39m entity\u001b[38;5;241m.\u001b[39mentity\u001b[38;5;241m.\u001b[39mkey:\n\u001b[0;32m--> 183\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[43mentity\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 184\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m variable_entity\u001b[38;5;241m.\u001b[39mis_person:\n\u001b[1;32m 185\u001b[0m values \u001b[38;5;241m=\u001b[39m group_agg_func(\n\u001b[1;32m 186\u001b[0m entity\u001b[38;5;241m.\u001b[39mmembers(variable, period, options\u001b[38;5;241m=\u001b[39moptions)\n\u001b[1;32m 187\u001b[0m )\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/populations/group_population.py:38\u001b[0m, in \u001b[0;36mGroupPopulation.__call__\u001b[0;34m(self, variable_name, period, options)\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msum(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmembers(variable_name, period, options))\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 38\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__call__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/populations/population.py:137\u001b[0m, in \u001b[0;36mPopulation.__call__\u001b[0;34m(self, variable_name, period, options)\u001b[0m\n\u001b[1;32m 133\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msimulation\u001b[38;5;241m.\u001b[39mcalculate_divide(\n\u001b[1;32m 134\u001b[0m variable_name, period, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mcalculate_kwargs\n\u001b[1;32m 135\u001b[0m )\n\u001b[1;32m 136\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 137\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msimulation\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 138\u001b[0m \u001b[43m \u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcalculate_kwargs\u001b[49m\n\u001b[1;32m 139\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File 
\u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/microsimulation.py:54\u001b[0m, in \u001b[0;36mMicrosimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, use_weights, decode_enums)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m period \u001b[38;5;241m=\u001b[39m get_period(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period)\n\u001b[0;32m---> 54\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_enums\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m use_weights:\n\u001b[1;32m 56\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m values\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:485\u001b[0m, in \u001b[0;36mSimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, decode_enums)\u001b[0m\n\u001b[1;32m 482\u001b[0m np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mseed(\u001b[38;5;28mhash\u001b[39m(variable_name \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(period)) \u001b[38;5;241m%\u001b[39m \u001b[38;5;241m1000000\u001b[39m)\n\u001b[1;32m 484\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 485\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_calculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 486\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, EnumArray) \u001b[38;5;129;01mand\u001b[39;00m decode_enums:\n\u001b[1;32m 487\u001b[0m result \u001b[38;5;241m=\u001b[39m result\u001b[38;5;241m.\u001b[39mdecode_to_str()\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:715\u001b[0m, in \u001b[0;36mSimulation._calculate\u001b[0;34m(self, variable_name, period)\u001b[0m\n\u001b[1;32m 713\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 714\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_for_cycle(variable\u001b[38;5;241m.\u001b[39mname, period)\n\u001b[0;32m--> 715\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_formula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 717\u001b[0m \u001b[38;5;66;03m# If no result, 
use the default value and cache it\u001b[39;00m\n\u001b[1;32m 718\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m array \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 719\u001b[0m \u001b[38;5;66;03m# Check if the variable has a previously defined value\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:940\u001b[0m, in \u001b[0;36mSimulation._run_formula\u001b[0;34m(self, variable, population, period)\u001b[0m\n\u001b[1;32m 938\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m added_variable \u001b[38;5;129;01min\u001b[39;00m adds_list:\n\u001b[1;32m 939\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m added_variable \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtax_benefit_system\u001b[38;5;241m.\u001b[39mvariables:\n\u001b[0;32m--> 940\u001b[0m values \u001b[38;5;241m=\u001b[39m values \u001b[38;5;241m+\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 941\u001b[0m \u001b[43m \u001b[49m\u001b[43madded_variable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvariable\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mentity\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkey\u001b[49m\n\u001b[1;32m 942\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 943\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 944\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/microsimulation.py:54\u001b[0m, in \u001b[0;36mMicrosimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, use_weights, decode_enums)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m period \u001b[38;5;241m=\u001b[39m get_period(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period)\n\u001b[0;32m---> 54\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_enums\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m use_weights:\n\u001b[1;32m 56\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m values\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:485\u001b[0m, in \u001b[0;36mSimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, decode_enums)\u001b[0m\n\u001b[1;32m 482\u001b[0m 
np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mseed(\u001b[38;5;28mhash\u001b[39m(variable_name \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(period)) \u001b[38;5;241m%\u001b[39m \u001b[38;5;241m1000000\u001b[39m)\n\u001b[1;32m 484\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 485\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_calculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 486\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, EnumArray) \u001b[38;5;129;01mand\u001b[39;00m decode_enums:\n\u001b[1;32m 487\u001b[0m result \u001b[38;5;241m=\u001b[39m result\u001b[38;5;241m.\u001b[39mdecode_to_str()\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:715\u001b[0m, in \u001b[0;36mSimulation._calculate\u001b[0;34m(self, variable_name, period)\u001b[0m\n\u001b[1;32m 713\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 714\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_for_cycle(variable\u001b[38;5;241m.\u001b[39mname, period)\n\u001b[0;32m--> 715\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_formula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 717\u001b[0m \u001b[38;5;66;03m# If no result, use the default value and cache it\u001b[39;00m\n\u001b[1;32m 718\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m array \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 719\u001b[0m \u001b[38;5;66;03m# Check if the variable has a previously defined value\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:1005\u001b[0m, in \u001b[0;36mSimulation._run_formula\u001b[0;34m(self, variable, population, period)\u001b[0m\n\u001b[1;32m 1003\u001b[0m array \u001b[38;5;241m=\u001b[39m formula(population, period)\n\u001b[1;32m 1004\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1005\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[43mformula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparameters_at\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1007\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m array\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/variables/gov/irs/tax/federal_income/alternative_minimum_tax/alternative_minimum_tax.py:24\u001b[0m, in \u001b[0;36malternative_minimum_tax.formula\u001b[0;34m(tax_unit, period, parameters)\u001b[0m\n\u001b[1;32m 21\u001b[0m amt_base_tax \u001b[38;5;241m=\u001b[39m tax_unit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mamt_base_tax\u001b[39m\u001b[38;5;124m\"\u001b[39m, period)\n\u001b[1;32m 23\u001b[0m \u001b[38;5;66;03m# Tax on capital gains (Part III)\u001b[39;00m\n\u001b[0;32m---> 24\u001b[0m form_6251_part_iii_required \u001b[38;5;241m=\u001b[39m 
\u001b[43mtax_unit\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mamt_part_iii_required\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 26\u001b[0m amt_tax_including_cg \u001b[38;5;241m=\u001b[39m tax_unit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mamt_tax_including_cg\u001b[39m\u001b[38;5;124m\"\u001b[39m, period)\n\u001b[1;32m 27\u001b[0m smaller_tax \u001b[38;5;241m=\u001b[39m min_(amt_base_tax, amt_tax_including_cg)\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/populations/group_population.py:38\u001b[0m, in \u001b[0;36mGroupPopulation.__call__\u001b[0;34m(self, variable_name, period, options)\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msum(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmembers(variable_name, period, options))\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 38\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__call__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/populations/population.py:137\u001b[0m, in \u001b[0;36mPopulation.__call__\u001b[0;34m(self, variable_name, period, options)\u001b[0m\n\u001b[1;32m 133\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msimulation\u001b[38;5;241m.\u001b[39mcalculate_divide(\n\u001b[1;32m 134\u001b[0m variable_name, period, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mcalculate_kwargs\n\u001b[1;32m 135\u001b[0m )\n\u001b[1;32m 136\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 137\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msimulation\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 138\u001b[0m \u001b[43m \u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcalculate_kwargs\u001b[49m\n\u001b[1;32m 139\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/microsimulation.py:54\u001b[0m, in \u001b[0;36mMicrosimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, use_weights, decode_enums)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m period \u001b[38;5;241m=\u001b[39m 
get_period(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period)\n\u001b[0;32m---> 54\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_enums\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m use_weights:\n\u001b[1;32m 56\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m values\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:485\u001b[0m, in \u001b[0;36mSimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, decode_enums)\u001b[0m\n\u001b[1;32m 482\u001b[0m np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mseed(\u001b[38;5;28mhash\u001b[39m(variable_name \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(period)) \u001b[38;5;241m%\u001b[39m \u001b[38;5;241m1000000\u001b[39m)\n\u001b[1;32m 484\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 485\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_calculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 486\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, EnumArray) \u001b[38;5;129;01mand\u001b[39;00m decode_enums:\n\u001b[1;32m 487\u001b[0m result \u001b[38;5;241m=\u001b[39m result\u001b[38;5;241m.\u001b[39mdecode_to_str()\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:715\u001b[0m, in \u001b[0;36mSimulation._calculate\u001b[0;34m(self, variable_name, period)\u001b[0m\n\u001b[1;32m 713\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 714\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_for_cycle(variable\u001b[38;5;241m.\u001b[39mname, period)\n\u001b[0;32m--> 715\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_formula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 717\u001b[0m \u001b[38;5;66;03m# If no result, use the default value and cache it\u001b[39;00m\n\u001b[1;32m 718\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m array \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 719\u001b[0m \u001b[38;5;66;03m# Check if the variable has a previously defined value\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:1005\u001b[0m, in \u001b[0;36mSimulation._run_formula\u001b[0;34m(self, variable, population, period)\u001b[0m\n\u001b[1;32m 1003\u001b[0m array \u001b[38;5;241m=\u001b[39m formula(population, period)\n\u001b[1;32m 1004\u001b[0m 
\u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1005\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[43mformula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparameters_at\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1007\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m array\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/variables/gov/irs/tax/federal_income/alternative_minimum_tax/amt_part_iii_required.py:13\u001b[0m, in \u001b[0;36mamt_part_iii_required.formula\u001b[0;34m(tax_unit, period, parameters)\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mformula\u001b[39m(tax_unit, period, parameters):\n\u001b[0;32m---> 13\u001b[0m relevant_inputs \u001b[38;5;241m=\u001b[39m \u001b[43madd\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 14\u001b[0m \u001b[43m \u001b[49m\u001b[43mtax_unit\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 15\u001b[0m \u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 16\u001b[0m \u001b[43m \u001b[49m\u001b[43m[\u001b[49m\n\u001b[1;32m 17\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdwks10\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 18\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdwks13\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 19\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdwks14\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 20\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdwks19\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 21\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43munrecaptured_section_1250_gain\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 22\u001b[0m \u001b[43m \u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 23\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m relevant_inputs \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/commons/formulas.py:227\u001b[0m, in \u001b[0;36madd\u001b[0;34m(entity, period, variables, options)\u001b[0m\n\u001b[1;32m 207\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21madd\u001b[39m(\n\u001b[1;32m 208\u001b[0m entity: Population,\n\u001b[1;32m 209\u001b[0m period: Period,\n\u001b[1;32m 210\u001b[0m variables: List[\u001b[38;5;28mstr\u001b[39m],\n\u001b[1;32m 211\u001b[0m options: List[\u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 212\u001b[0m ):\n\u001b[1;32m 213\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Sums a list of variables.\u001b[39;00m\n\u001b[1;32m 214\u001b[0m \n\u001b[1;32m 215\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 225\u001b[0m \u001b[38;5;124;03m ArrayLike: The result of the operation.\u001b[39;00m\n\u001b[1;32m 226\u001b[0m \u001b[38;5;124;03m 
\"\"\"\u001b[39;00m\n\u001b[0;32m--> 227\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfor_each_variable\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 228\u001b[0m \u001b[43m \u001b[49m\u001b[43mentity\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvariables\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43magg_func\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43madd\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moptions\u001b[49m\n\u001b[1;32m 229\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/commons/formulas.py:183\u001b[0m, in \u001b[0;36mfor_each_variable\u001b[0;34m(entity, period, variables, agg_func, group_agg_func, options)\u001b[0m\n\u001b[1;32m 181\u001b[0m variable_entity \u001b[38;5;241m=\u001b[39m entity\u001b[38;5;241m.\u001b[39mentity\u001b[38;5;241m.\u001b[39mget_variable(variable)\u001b[38;5;241m.\u001b[39mentity\n\u001b[1;32m 182\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m variable_entity\u001b[38;5;241m.\u001b[39mkey \u001b[38;5;241m==\u001b[39m entity\u001b[38;5;241m.\u001b[39mentity\u001b[38;5;241m.\u001b[39mkey:\n\u001b[0;32m--> 183\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[43mentity\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 184\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m variable_entity\u001b[38;5;241m.\u001b[39mis_person:\n\u001b[1;32m 185\u001b[0m values \u001b[38;5;241m=\u001b[39m group_agg_func(\n\u001b[1;32m 186\u001b[0m entity\u001b[38;5;241m.\u001b[39mmembers(variable, period, options\u001b[38;5;241m=\u001b[39moptions)\n\u001b[1;32m 187\u001b[0m )\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/populations/group_population.py:38\u001b[0m, in \u001b[0;36mGroupPopulation.__call__\u001b[0;34m(self, variable_name, period, options)\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msum(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmembers(variable_name, period, options))\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 38\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__call__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/populations/population.py:137\u001b[0m, in \u001b[0;36mPopulation.__call__\u001b[0;34m(self, variable_name, period, options)\u001b[0m\n\u001b[1;32m 133\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msimulation\u001b[38;5;241m.\u001b[39mcalculate_divide(\n\u001b[1;32m 134\u001b[0m variable_name, period, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mcalculate_kwargs\n\u001b[1;32m 135\u001b[0m )\n\u001b[1;32m 136\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 137\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msimulation\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 138\u001b[0m \u001b[43m \u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcalculate_kwargs\u001b[49m\n\u001b[1;32m 139\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/microsimulation.py:54\u001b[0m, in \u001b[0;36mMicrosimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, use_weights, decode_enums)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m period \u001b[38;5;241m=\u001b[39m get_period(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period)\n\u001b[0;32m---> 54\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_enums\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m use_weights:\n\u001b[1;32m 56\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m values\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:485\u001b[0m, in \u001b[0;36mSimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, decode_enums)\u001b[0m\n\u001b[1;32m 482\u001b[0m np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mseed(\u001b[38;5;28mhash\u001b[39m(variable_name \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(period)) \u001b[38;5;241m%\u001b[39m \u001b[38;5;241m1000000\u001b[39m)\n\u001b[1;32m 484\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 485\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_calculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 486\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, EnumArray) \u001b[38;5;129;01mand\u001b[39;00m decode_enums:\n\u001b[1;32m 487\u001b[0m result 
\u001b[38;5;241m=\u001b[39m result\u001b[38;5;241m.\u001b[39mdecode_to_str()\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:612\u001b[0m, in \u001b[0;36mSimulation._calculate\u001b[0;34m(self, variable_name, period)\u001b[0m\n\u001b[1;32m 610\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mVariable \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvariable_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m does not exist.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 611\u001b[0m population \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_variable_population(variable_name)\n\u001b[0;32m--> 612\u001b[0m holder \u001b[38;5;241m=\u001b[39m \u001b[43mpopulation\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_holder\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 613\u001b[0m variable \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtax_benefit_system\u001b[38;5;241m.\u001b[39mget_variable(\n\u001b[1;32m 614\u001b[0m variable_name, check_existence\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 615\u001b[0m )\n\u001b[1;32m 617\u001b[0m \u001b[38;5;66;03m# Check if we've neutralized via parameters.\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/populations/population.py:145\u001b[0m, in \u001b[0;36mPopulation.get_holder\u001b[0;34m(self, variable_name)\u001b[0m\n\u001b[1;32m 143\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mget_holder\u001b[39m(\u001b[38;5;28mself\u001b[39m, variable_name: \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Holder:\n\u001b[1;32m 144\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mentity\u001b[38;5;241m.\u001b[39mcheck_variable_defined_for_entity(variable_name)\n\u001b[0;32m--> 145\u001b[0m holder \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_holders\u001b[38;5;241m.\u001b[39mget(variable_name)\n\u001b[1;32m 146\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m holder:\n\u001b[1;32m 147\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m holder\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], "source": [ "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", "\n", @@ -711,10 +779,43 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "9602953a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " strategy parameter dataset_size total_loss\n", + "0 none none 41310 0.0069\n", + "1 l0_sigmoid 1.0 41310 0.0069\n", + "2 l0_sigmoid 0.1 37048 5.8041\n", + "3 l0_sigmoid 0.01 32786 13.3255\n", + "4 l0_sigmoid 0.001 28524 21.6723\n", + "5 l0_sigmoid 0.0001 24262 30.6049\n", + "6 l0_sigmoid 0.00001 20000 40.0000\n", + "7 l0_log 1.0 41310 0.0069\n", + "8 l0_log 0.1 37048 8.7028\n", + "9 l0_log 0.01 32786 19.9847\n", + "10 l0_log 0.001 28524 32.5050\n", + "11 l0_log 0.0001 24262 45.9039\n", + "12 l0_log 0.00001 20000 59.9965\n", + "13 l0_exp 1.0 41310 0.0069\n", + "14 l0_exp 0.1 37048 11.6014\n", + "15 l0_exp 0.01 32786 26.6440\n", + "16 l0_exp 0.001 
28524 43.3377\n", + "17 l0_exp 0.0001 24262 61.2029\n", + "18 l0_exp 0.00001 20000 79.9931\n", + "19 l1 1.0 41310 0.0069\n", + "20 l1 0.1 37048 14.5000\n", + "21 l1 0.01 32786 33.3033\n", + "22 l1 0.001 28524 54.1704\n", + "23 l1 0.0001 24262 76.5019\n", + "24 l1 0.00001 20000 99.9896\n" + ] + } + ], "source": [ "'''\n", "Synthetic dataset\n", @@ -730,14 +831,28 @@ "base_loss = 0.0069\n", "max_loss = 40.0\n", "\n", - "# Construct rows\n", + "strategy_slopes = {\n", + " 'l0_sigmoid': 1.0,\n", + " 'l0_log': 1.5,\n", + " 'l0_exp': 2.0,\n", + " 'l1': 2.5,\n", + "}\n", + "\n", "rows = [{'strategy': 'none', 'parameter': 'none', 'dataset_size': base_size, 'total_loss': base_loss}]\n", "\n", "for strategy in strategies:\n", + " slope = strategy_slopes[strategy]\n", + " \n", " for i, param in enumerate(parameters):\n", - " # Gradually decrease size and increase loss\n", - " size = int(base_size - (base_size - min_size) * (i / (len(parameters) - 1)))\n", - " loss = round(base_loss + (max_loss - base_loss) * (i / (len(parameters) - 1)), 4)\n", + " # Normalized compression level: 0 (no compression) to 1 (max compression)\n", + " compression_level = i / (len(parameters) - 1)\n", + " \n", + " # Size shrinks linearly\n", + " size = int(base_size - (base_size - min_size) * compression_level)\n", + " \n", + " # Loss increases quadratically (or linearly) based on strategy slope\n", + " loss = round(base_loss + slope * (max_loss - base_loss) * (compression_level ** 1.2), 4)\n", + " \n", " rows.append({\n", " 'strategy': strategy,\n", " 'parameter': param,\n", @@ -751,6 +866,1244 @@ "# Display\n", "print(reg_results_df)" ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2dc0891c", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "plotlyServerURL": "https://plot.ly" + }, + "data": [ + { + "customdata": [ + [ + "l0_exp" + ], + [ + "l0_exp" + ], + [ + "l0_exp" + ], + [ + "l0_exp" + ], + [ + "l0_exp" + ], + [ + "l0_exp" + ] + ], + "hovertemplate": "Strategy: %{customdata[0]}
Size: %{x}<br>Loss: %{y:.4f}<br>
Param: %{text}", + "legendgroup": "l0_exp", + "line": { + "color": "#636efa", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines+markers+text", + "name": "l0_exp", + "orientation": "v", + "showlegend": true, + "text": [ + "1.0", + "0.1", + "0.01", + "0.001", + "0.0001", + "1e-05" + ], + "textposition": "top center", + "type": "scatter", + "x": [ + 41310, + 37048, + 32786, + 28524, + 24262, + 20000 + ], + "xaxis": "x", + "y": [ + 0.0069, + 11.6014, + 26.644, + 43.3377, + 61.2029, + 79.9931 + ], + "yaxis": "y" + }, + { + "customdata": [ + [ + "l0_log" + ], + [ + "l0_log" + ], + [ + "l0_log" + ], + [ + "l0_log" + ], + [ + "l0_log" + ], + [ + "l0_log" + ] + ], + "hovertemplate": "Strategy: %{customdata[0]}
Size: %{x}<br>Loss: %{y:.4f}<br>
Param: %{text}", + "legendgroup": "l0_log", + "line": { + "color": "#EF553B", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines+markers+text", + "name": "l0_log", + "orientation": "v", + "showlegend": true, + "text": [ + "1.0", + "0.1", + "0.01", + "0.001", + "0.0001", + "1e-05" + ], + "textposition": "top center", + "type": "scatter", + "x": [ + 41310, + 37048, + 32786, + 28524, + 24262, + 20000 + ], + "xaxis": "x", + "y": [ + 0.0069, + 8.7028, + 19.9847, + 32.505, + 45.9039, + 59.9965 + ], + "yaxis": "y" + }, + { + "customdata": [ + [ + "l0_sigmoid" + ], + [ + "l0_sigmoid" + ], + [ + "l0_sigmoid" + ], + [ + "l0_sigmoid" + ], + [ + "l0_sigmoid" + ], + [ + "l0_sigmoid" + ] + ], + "hovertemplate": "Strategy: %{customdata[0]}
Size: %{x}<br>Loss: %{y:.4f}<br>
Param: %{text}", + "legendgroup": "l0_sigmoid", + "line": { + "color": "#00cc96", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines+markers+text", + "name": "l0_sigmoid", + "orientation": "v", + "showlegend": true, + "text": [ + "1.0", + "0.1", + "0.01", + "0.001", + "0.0001", + "1e-05" + ], + "textposition": "top center", + "type": "scatter", + "x": [ + 41310, + 37048, + 32786, + 28524, + 24262, + 20000 + ], + "xaxis": "x", + "y": [ + 0.0069, + 5.8041, + 13.3255, + 21.6723, + 30.6049, + 40 + ], + "yaxis": "y" + }, + { + "customdata": [ + [ + "l1" + ], + [ + "l1" + ], + [ + "l1" + ], + [ + "l1" + ], + [ + "l1" + ], + [ + "l1" + ] + ], + "hovertemplate": "Strategy: %{customdata[0]}
Size: %{x}<br>Loss: %{y:.4f}<br>
Param: %{text}", + "legendgroup": "l1", + "line": { + "color": "#ab63fa", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines+markers+text", + "name": "l1", + "orientation": "v", + "showlegend": true, + "text": [ + "1.0", + "0.1", + "0.01", + "0.001", + "0.0001", + "1e-05" + ], + "textposition": "top center", + "type": "scatter", + "x": [ + 41310, + 37048, + 32786, + 28524, + 24262, + 20000 + ], + "xaxis": "x", + "y": [ + 0.0069, + 14.5, + 33.3033, + 54.1704, + 76.5019, + 99.9896 + ], + "yaxis": "y" + } + ], + "layout": { + "annotations": [ + { + "arrowhead": 1, + "ax": 40, + "ay": -40, + "font": { + "color": "gray" + }, + "showarrow": true, + "text": "Baseline", + "x": 41310, + "y": 0.0069 + } + ], + "height": 600, + "hovermode": "closest", + "legend": { + "title": { + "text": "Strategy" + }, + "tracegroupgap": 0 + }, + "shapes": [ + { + "line": { + "color": "gray", + "dash": "dash" + }, + "name": "Baseline Size", + "type": "line", + "x0": 41310, + "x1": 41310, + "y0": 0.0069, + "y1": 99.9896 + }, + { + "line": { + "color": "gray", + "dash": "dash" + }, + "name": "Baseline Loss", + "type": "line", + "x0": 20000, + "x1": 41310, + "y0": 0.0069, + "y1": 0.0069 + } + ], + "template": { + "data": { + "bar": [ + { + "error_x": { + "color": "#2a3f5f" + }, + "error_y": { + "color": "#2a3f5f" + }, + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + }, + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "bar" + } + ], + "barpolar": [ + { + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + }, + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "barpolar" + } + ], + "carpet": [ + { + "aaxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "baxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "type": "carpet" + } + ], + "choropleth": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "choropleth" + } + ], + "contour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "contour" + } + ], + "contourcarpet": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "contourcarpet" + } + ], + "heatmap": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmap" + } + ], + "heatmapgl": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 
0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmapgl" + } + ], + "histogram": [ + { + "marker": { + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "histogram" + } + ], + "histogram2d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2d" + } + ], + "histogram2dcontour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2dcontour" + } + ], + "mesh3d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "mesh3d" + } + ], + "parcoords": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "parcoords" + } + ], + "pie": [ + { + "automargin": true, + "type": "pie" + } + ], + "scatter": [ + { + "fillpattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + }, + "type": "scatter" + } + ], + "scatter3d": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter3d" + } + ], + "scattercarpet": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattercarpet" + } + ], + "scattergeo": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergeo" + } + ], + "scattergl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergl" + } + ], + "scattermapbox": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattermapbox" + } + ], + "scatterpolar": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolar" + } + ], + "scatterpolargl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolargl" + } + ], + "scatterternary": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterternary" + } + ], + "surface": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + 
"#f0f921" + ] + ], + "type": "surface" + } + ], + "table": [ + { + "cells": { + "fill": { + "color": "#EBF0F8" + }, + "line": { + "color": "white" + } + }, + "header": { + "fill": { + "color": "#C8D4E3" + }, + "line": { + "color": "white" + } + }, + "type": "table" + } + ] + }, + "layout": { + "annotationdefaults": { + "arrowcolor": "#2a3f5f", + "arrowhead": 0, + "arrowwidth": 1 + }, + "autotypenumbers": "strict", + "coloraxis": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "colorscale": { + "diverging": [ + [ + 0, + "#8e0152" + ], + [ + 0.1, + "#c51b7d" + ], + [ + 0.2, + "#de77ae" + ], + [ + 0.3, + "#f1b6da" + ], + [ + 0.4, + "#fde0ef" + ], + [ + 0.5, + "#f7f7f7" + ], + [ + 0.6, + "#e6f5d0" + ], + [ + 0.7, + "#b8e186" + ], + [ + 0.8, + "#7fbc41" + ], + [ + 0.9, + "#4d9221" + ], + [ + 1, + "#276419" + ] + ], + "sequential": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "sequentialminus": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ] + }, + "colorway": [ + "#636efa", + "#EF553B", + "#00cc96", + "#ab63fa", + "#FFA15A", + "#19d3f3", + "#FF6692", + "#B6E880", + "#FF97FF", + "#FECB52" + ], + "font": { + "color": "#2a3f5f" + }, + "geo": { + "bgcolor": "white", + "lakecolor": "white", + "landcolor": "#E5ECF6", + "showlakes": true, + "showland": true, + "subunitcolor": "white" + }, + "hoverlabel": { + "align": "left" + }, + "hovermode": "closest", + "mapbox": { + "style": "light" + }, + "paper_bgcolor": "white", + "plot_bgcolor": "#E5ECF6", + "polar": { + "angularaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "radialaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "scene": { + "xaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "yaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "zaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + } + }, + "shapedefaults": { + "line": { + "color": "#2a3f5f" + } + }, + "ternary": { + "aaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "baxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "caxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "title": { + "x": 0.05 + }, + "xaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + }, + "yaxis": { + 
"automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + } + } + }, + "title": { + "text": "ECPS Regularization Strategy Comparison" + }, + "width": 900, + "xaxis": { + "anchor": "y", + "domain": [ + 0, + 1 + ], + "title": { + "text": "Number of Households" + } + }, + "yaxis": { + "anchor": "x", + "domain": [ + 0, + 1 + ], + "title": { + "text": "Calibration Score" + } + } + } + } + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Filter out the baseline row\n", + "df_plot = reg_results_df[reg_results_df['strategy'] != 'none'].copy()\n", + "df_plot['parameter'] = df_plot['parameter'].astype(str)\n", + "df_plot = df_plot.sort_values(by=['strategy', 'dataset_size'], ascending=[True, False])\n", + "\n", + "# Create line plot\n", + "fig = px.line(\n", + " df_plot,\n", + " x=\"dataset_size\",\n", + " y=\"total_loss\",\n", + " color=\"strategy\",\n", + " markers=True,\n", + " text=\"parameter\",\n", + " custom_data=[\"strategy\"],\n", + " title=\"ECPS Regularization Strategy Comparison\",\n", + " labels={\n", + " \"dataset_size\": \"Number of Households\",\n", + " \"total_loss\": \"Calibration Score\",\n", + " \"strategy\": \"Regularization Approach\"\n", + " }\n", + ")\n", + "\n", + "# Add text labels (parameter) on hover\n", + "fig.update_traces(\n", + " textposition=\"top center\", \n", + " hovertemplate=(\n", + " \"Strategy: %{customdata[0]}
\"\n", + " \"Size: %{x}
\"\n", + " \"Loss: %{y:.4f}
\"\n", + " \"Param: %{text}\"\n", + " )\n", + ")\n", + "\n", + "# Add baseline lines\n", + "baseline = reg_results_df[reg_results_df['strategy'] == 'none'].iloc[0]\n", + "\n", + "fig.add_shape(\n", + " type=\"line\",\n", + " x0=baseline[\"dataset_size\"], x1=baseline[\"dataset_size\"],\n", + " y0=df_plot[\"total_loss\"].min(), y1=df_plot[\"total_loss\"].max(),\n", + " line=dict(color=\"gray\", dash=\"dash\"),\n", + " name=\"Baseline Size\"\n", + ")\n", + "\n", + "fig.add_shape(\n", + " type=\"line\",\n", + " x0=df_plot[\"dataset_size\"].min(), x1=df_plot[\"dataset_size\"].max(),\n", + " y0=baseline[\"total_loss\"], y1=baseline[\"total_loss\"],\n", + " line=dict(color=\"gray\", dash=\"dash\"),\n", + " name=\"Baseline Loss\"\n", + ")\n", + "\n", + "# Add annotation for the baseline\n", + "fig.add_annotation(\n", + " x=baseline[\"dataset_size\"],\n", + " y=baseline[\"total_loss\"],\n", + " text=\"Baseline\",\n", + " showarrow=True,\n", + " arrowhead=1,\n", + " ax=40,\n", + " ay=-40,\n", + " font=dict(color=\"gray\"),\n", + ")\n", + "\n", + "# Final layout adjustments\n", + "fig.update_layout(\n", + " legend_title=\"Strategy\",\n", + " hovermode=\"closest\",\n", + " width=900,\n", + " height=600\n", + ")\n", + "\n", + "fig.show()" + ] } ], "metadata": { From b3208db3a85d703081874c17f80d0ae20e8a02a2 Mon Sep 17 00:00:00 2001 From: eccuraa Date: Tue, 15 Jul 2025 15:55:42 -0400 Subject: [PATCH 53/58] full test arena?? (trying it now) --- test_minimization_approach.ipynb | 147 ++++++++----------------------- 1 file changed, 35 insertions(+), 112 deletions(-) diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 9952ae4e..6c3921f0 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,19 +2,10 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "id": "d6dc9cca", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], + "outputs": [], "source": [ "from policyengine_us_data.utils.minimize import minimize_dataset, random_sampling_minimization, candidate_loss_contribution\n", "from policyengine_us_data.storage import STORAGE_FOLDER\n", @@ -226,65 +217,15 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "aeab67b3", "metadata": {}, "outputs": [ { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[6], line 31\u001b[0m\n\u001b[1;32m 29\u001b[0m output_path \u001b[38;5;241m=\u001b[39m STORAGE_FOLDER \u001b[38;5;241m/\u001b[39m approach \u001b[38;5;241m/\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvalue\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_enhanced_cps_2024_minimised.h5\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 30\u001b[0m output_path\u001b[38;5;241m.\u001b[39mparent\u001b[38;5;241m.\u001b[39mmkdir(parents\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m---> 31\u001b[0m \u001b[43mminimize_dataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 32\u001b[0m \u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 33\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 34\u001b[0m \u001b[43m \u001b[49m\u001b[43mminimization_function\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mminimization_function\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 35\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget_fractions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 36\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m params \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mloss_rel_change_max\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 38\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m files:\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/utils/minimize.py:336\u001b[0m, in \u001b[0;36mminimize_dataset\u001b[0;34m(dataset, output_path, minimization_function, loss_matrix, targets, **kwargs)\u001b[0m\n\u001b[1;32m 333\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 334\u001b[0m dataset_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(dataset)\n\u001b[0;32m--> 336\u001b[0m \u001b[43mcreate_calibration_log_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 338\u001b[0m dataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_file(dataset_path)\n\u001b[1;32m 339\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m loss_matrix \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m targets \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/utils/minimize.py:26\u001b[0m, in 
[Remainder of the ANSI-escaped KeyboardInterrupt traceback omitted; the interrupt was raised inside build_loss_matrix while calculating income_tax for the tax-expenditure targets.]
\u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_variable_population(variable_name)\n\u001b[0;32m--> 612\u001b[0m holder \u001b[38;5;241m=\u001b[39m \u001b[43mpopulation\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_holder\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 613\u001b[0m variable \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtax_benefit_system\u001b[38;5;241m.\u001b[39mget_variable(\n\u001b[1;32m 614\u001b[0m variable_name, check_existence\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 615\u001b[0m )\n\u001b[1;32m 617\u001b[0m \u001b[38;5;66;03m# Check if we've neutralized via parameters.\u001b[39;00m\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/populations/population.py:145\u001b[0m, in \u001b[0;36mPopulation.get_holder\u001b[0;34m(self, variable_name)\u001b[0m\n\u001b[1;32m 143\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mget_holder\u001b[39m(\u001b[38;5;28mself\u001b[39m, variable_name: \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Holder:\n\u001b[1;32m 144\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mentity\u001b[38;5;241m.\u001b[39mcheck_variable_defined_for_entity(variable_name)\n\u001b[0;32m--> 145\u001b[0m holder \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_holders\u001b[38;5;241m.\u001b[39mget(variable_name)\n\u001b[1;32m 146\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m holder:\n\u001b[1;32m 147\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m holder\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + "name": "stderr", + "output_type": "stream", + "text": [ + " 91%|█████████ | 228/250 [01:44<00:21, 1.04it/s, loss=3.27e-5, loss_rel_change=-0.384]" ] } ], @@ -337,25 +278,6 @@ " )" ] }, - { - "cell_type": "markdown", - "id": "fa1ea957", - "metadata": {}, - "source": [ - "### (Temporary) Cleaning of data (removing weights smaller than epsilon)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e88df261", - "metadata": {}, - "outputs": [], - "source": [ - "## this should go in the enhanced_cps_2024.py file, because household removal doesn't happen there\n", - "# Need to check Ben's PR." 
- ] - }, { "cell_type": "markdown", "id": "f8b0fe2e", @@ -463,7 +385,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": null, "id": "7bb3ef3c", "metadata": {}, "outputs": [ @@ -710,13 +632,9 @@ ], "source": [ "\"\"\"\n", - "Pulling values from created calibration_log.csv and .h5 files to populate the line plot dataframe\n", - "\n", - "( I need to pull the strategy (folder name), parameter (from file title??), dataset size (from length of .h5 file), and total loss (from sum of loss column in calibration_log_file.csv))\n", - "\n", - "approaches = [\"l0_exp\", \"l1\"] \n", - "penalty_weights = [1e-2, 1e-1]\n", + "Scraping values from created calibration_log.csv and .h5 files to populate the plotting dataframe\n", "\"\"\"\n", + "\n", "approaches = [\"l0_sigmoid\", \"l0_log\", \"l0_exp\", \"l1\"]\n", "penalty_weights = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]\n", "og_size = 41310 # Original size of the dataset\n", @@ -731,11 +649,9 @@ " # Pull length of .h5 file\n", " h5_name = f\"enhanced_cps_2024_{strategy}_{parameter}_minimised.h5\"\n", " h5_path = get_output_path(strategy, h5_name)\n", - " # see if this works\n", " dataset_size = len(h5py.File(h5_path, \"r\")['household_weight/2024'])\n", - " #total_size.append(dataset_size)\n", "\n", - " # Pull sum of loss column\n", + " # Pull score of loss column\n", " cal_log_name = f\"calibration_log_{strategy}_{parameter}.csv\"\n", " cal_log_path = get_output_path(strategy, cal_log_name)\n", " calibration_log = pd.read_csv(cal_log_path)\n", @@ -745,27 +661,30 @@ "\n", "\n", "\n", - "'''\n", - "\n", - "fraction = [0.5, 0.6, 0.7, 0.8, 0.9]\n", + "approaches = {\n", + " \"random_sampling_minimization\":[0.5, 0.6, 0.7, 0.8, 0.9], \n", + " \"candidate_loss_contribution\": [0.001, 0.0001, 0.00001, 0.000001, 0.0000001],\n", + "}\n", "\n", - "for fraction in fraction:\n", - " strategy = \"random_sampling_minimization\"\n", - " parameter = fraction\n", + "for approach, fractions in approaches.items(): # Use .items() to get key-value pairs\n", + " for fraction in fractions:\n", + " strategy = approach\n", + " parameter = fraction\n", "\n", - " # Pull length of .h5 file\n", - " h5_name = f\"{fraction}_enhanced_cps_2024_random_sampling_minimization_minimised.h5\"\n", - " h5_path = STORAGE_FOLDER / strategy / h5_name\n", - " dataset_size = len(h5py.File(h5_path, \"r\")['household_weight/2024'])\n", + " # Pull length of .h5 file\n", + " h5_name = f\"{fraction}_enhanced_cps_2024_{approach}_minimised.h5\"\n", + " h5_path = STORAGE_FOLDER / strategy / h5_name\n", + " dataset_size = len(h5py.File(h5_path, \"r\")['household_weight/2024'])\n", "\n", - " # Pull sum of loss column\n", - " cal_log_name = f\"{fraction}_enhanced_cps_2024_random_sampling_minimization_minimised_calibration_log.csv\"\n", - " cal_log_path = STORAGE_FOLDER / strategy / cal_log_name\n", - " total_loss = pd.read_csv(cal_log_path)['loss'].sum()\n", + " # Pull sum of loss column\n", + " cal_log_name = f\"{fraction}_enhanced_cps_2024_{approach}_minimised_calibration_log.csv\"\n", + " cal_log_path = STORAGE_FOLDER / strategy / cal_log_name\n", + " cal_log_path = get_output_path(strategy, cal_log_name)\n", + " calibration_log = pd.read_csv(cal_log_path)\n", + " loss_value = loss_score(calibration_log)\n", "\n", - " add_result(df, strategy, parameter, dataset_size, total_loss)\n", + " reg_results_df = add_result(reg_results_df, strategy, parameter, dataset_size, loss_value)\n", "\n", - "'''\n", "reg_results_df\n" ] }, @@ -869,7 +788,7 @@ }, { "cell_type": "code", - "execution_count": 3, + 
"execution_count": null, "id": "2dc0891c", "metadata": {}, "outputs": [ @@ -2030,6 +1949,10 @@ } ], "source": [ + "\"\"\"\n", + "Creating a multi-line plot with plotly\n", + "\"\"\"\n", + "\n", "# Filter out the baseline row\n", "df_plot = reg_results_df[reg_results_df['strategy'] != 'none'].copy()\n", "df_plot['parameter'] = df_plot['parameter'].astype(str)\n", From 4d8f60c29dbbab6ba087259aad08f8a724896abf Mon Sep 17 00:00:00 2001 From: eccuraa Date: Tue, 15 Jul 2025 15:56:21 -0400 Subject: [PATCH 54/58] forgot a file --- policyengine_us_data/datasets/cps/enhanced_cps.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 3da4f571..59abeafa 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -39,7 +39,7 @@ def reweight( loss_matrix, targets_array, dropout_rate=0.05, - epochs=500, + epochs=250, log_path="calibration_log.csv", penalty_approach=None, penalty_weight=None, @@ -270,7 +270,7 @@ def generate(self): loss_matrix_clean, targets_array_clean, log_path="calibration_log.csv", - epochs=150, + epochs=250, ) data["household_weight"][year] = optimised_weights From 112658f781b2fbf39d9cd1936a009478dab9a5a7 Mon Sep 17 00:00:00 2001 From: eccuraa Date: Tue, 15 Jul 2025 16:00:24 -0400 Subject: [PATCH 55/58] added some headers, just need to add pruning --- test_minimization_approach.ipynb | 45 ++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 6c3921f0..2e0ff269 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -1,5 +1,21 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "037988b0", + "metadata": {}, + "source": [ + "# Testing Arena for Different Regularization Strategies" + ] + }, + { + "cell_type": "markdown", + "id": "268ab898", + "metadata": {}, + "source": [ + "#### Imports" + ] + }, { "cell_type": "code", "execution_count": 4, @@ -52,7 +68,7 @@ "id": "e99994d3", "metadata": {}, "source": [ - "# Enhanced_CPS_2024.py Approaches" + "## Enhanced_CPS_2024.py Regularization Approaches" ] }, { @@ -212,7 +228,7 @@ "id": "69ff392d", "metadata": {}, "source": [ - "# Minimize.py approaches" + "## Minimize.py Regularization Approaches" ] }, { @@ -225,7 +241,17 @@ "name": "stderr", "output_type": "stream", "text": [ - " 91%|█████████ | 228/250 [01:44<00:21, 1.04it/s, loss=3.27e-5, loss_rel_change=-0.384]" + "100%|██████████| 250/250 [01:59<00:00, 2.08it/s, loss=3.47e-5, loss_rel_change=-0.347]\n", + "100%|██████████| 250/250 [01:43<00:00, 2.41it/s, loss=3.27e-5, loss_rel_change=-0.407]\n", + "100%|██████████| 250/250 [02:00<00:00, 2.08it/s, loss=3.22e-5, loss_rel_change=-0.368]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Weight relative change: 99.95%\n", + "Re-calibrating final selected households...\n" ] } ], @@ -278,12 +304,21 @@ " )" ] }, + { + "cell_type": "markdown", + "id": "8568b5ca", + "metadata": {}, + "source": [ + "## Visualization of Results\n", + "Calibration logs can also be shown in María's Vercel dashboard" + ] + }, { "cell_type": "markdown", "id": "f8b0fe2e", "metadata": {}, "source": [ - "### Visualization" + "### Data Scrape for Plotting" ] }, { @@ -693,7 +728,7 @@ "id": "5b203ccd", "metadata": {}, "source": [ - "## Plotting" + "### Plotting" ] }, { From fa9aa02486ebb54b0466fbbe553671c494eacd4d Mon 
Sep 17 00:00:00 2001 From: eccuraa Date: Wed, 16 Jul 2025 10:29:47 -0400 Subject: [PATCH 56/58] fixed a scraping bug & deleted synthetic data --- test_minimization_approach.ipynb | 456 ++++++++++++------------------- 1 file changed, 168 insertions(+), 288 deletions(-) diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 2e0ff269..e9f8eb69 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -18,15 +18,27 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 12, "id": "d6dc9cca", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "ImportError", + "evalue": "cannot import name 'prune_dataset' from 'policyengine_us_data.datasets.cps.enhanced_cps' (/Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/datasets/cps/enhanced_cps.py)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[12], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpolicyengine_us_data\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mstorage\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m STORAGE_FOLDER\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpolicyengine_us\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Microsimulation\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpolicyengine_us_data\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatasets\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcps\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01menhanced_cps\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m reweight, prune_dataset, ExtendedCPS_2024\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpolicyengine_us_data\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m build_loss_matrix\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mnumpy\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mnp\u001b[39;00m\n", + "\u001b[0;31mImportError\u001b[0m: cannot import name 'prune_dataset' from 'policyengine_us_data.datasets.cps.enhanced_cps' (/Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/datasets/cps/enhanced_cps.py)" + ] + } + ], "source": [ "from policyengine_us_data.utils.minimize import minimize_dataset, random_sampling_minimization, candidate_loss_contribution\n", "from policyengine_us_data.storage import STORAGE_FOLDER\n", "from policyengine_us import Microsimulation\n", - "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", + "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, prune_dataset, ExtendedCPS_2024\n", "from policyengine_us_data.utils import build_loss_matrix\n", "import numpy as np\n", "import os\n", @@ -214,7 +226,10 @@ " penalty_weight=penalty_weight, \n", " epochs=250, # Reduced epochs for faster processing\n", " 
)\n", - " data[\"household_weight\"][year] = optimised_weights\n", + " keep_indices = prune_dataset(optimised_weights, epsilon=1e-3, method=\"threshold\")\n", + " pruned_weights = optimised_weights[keep_indices]\n", + " \n", + " data[\"household_weight\"][year] = pruned_weights\n", "\n", " # Save to HDF5 file\n", " with h5py.File(h5_path, \"w\") as f:\n", @@ -233,7 +248,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "aeab67b3", "metadata": {}, "outputs": [ @@ -253,6 +268,117 @@ "Weight relative change: 99.95%\n", "Re-calibrating final selected households...\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250/250 [01:32<00:00, 2.70it/s, loss=3.35e-5, loss_rel_change=-0.359]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Final calibration completed successfully\n", + "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/random_sampling_minimization/0.5_enhanced_cps_2024_minimised.h5\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250/250 [01:45<00:00, 2.38it/s, loss=3.46e-5, loss_rel_change=-0.318]\n", + "100%|██████████| 250/250 [01:42<00:00, 2.44it/s, loss=3.11e-5, loss_rel_change=-0.395]\n", + "100%|██████████| 250/250 [01:46<00:00, 2.35it/s, loss=3.08e-5, loss_rel_change=-0.405]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Weight relative change: 99.99%\n", + "Re-calibrating final selected households...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250/250 [02:18<00:00, 1.80it/s, loss=3.14e-5, loss_rel_change=-0.385]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Final calibration completed successfully\n", + "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/random_sampling_minimization/0.6_enhanced_cps_2024_minimised.h5\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250/250 [02:36<00:00, 1.60it/s, loss=3.29e-5, loss_rel_change=-0.343]\n", + "100%|██████████| 250/250 [3:02:18<00:00, 43.76s/it, loss=3.43e-5, loss_rel_change=-0.578] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Iteration 1: Loss changed from 3.668773852244141e-08 to 3.9001762470775345e-08, which is too high (6.31%). Stopping.\n", + "Weight relative change: 100.00%\n", + "Re-calibrating final selected households...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250/250 [02:07<00:00, 1.95it/s, loss=3.23e-5, loss_rel_change=-0.364]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Final calibration completed successfully\n", + "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/0.001_enhanced_cps_2024_minimised.h5\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250/250 [02:10<00:00, 1.92it/s, loss=3.19e-5, loss_rel_change=-0.372]\n", + "100%|██████████| 250/250 [02:07<00:00, 1.96it/s, loss=3.58e-5, loss_rel_change=-0.556]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Iteration 1: Loss changed from 3.708600229852418e-08 to 3.936675423208132e-08, which is too high (6.15%). 
Stopping.\n", + "Weight relative change: 100.00%\n", + "Re-calibrating final selected households...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250/250 [02:20<00:00, 1.78it/s, loss=3.22e-5, loss_rel_change=-0.38] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Final calibration completed successfully\n", + "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/0.0001_enhanced_cps_2024_minimised.h5\n" + ] } ], "source": [ @@ -323,7 +449,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 13, "id": "225debd8", "metadata": {}, "outputs": [ @@ -371,7 +497,7 @@ "0 none none 41310 0.0069" ] }, - "execution_count": 62, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -382,6 +508,12 @@ "Creating dataframe to store regularization results\n", "\"\"\"\n", "\n", + "\n", + "def get_output_path(approach, file_name):\n", + " output_path = STORAGE_FOLDER / approach / file_name\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + " return output_path\n", + "\n", "# Calculate quality categories\n", "def loss_score(calibration_log):\n", " excellent_count = (\n", @@ -420,7 +552,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "7bb3ef3c", "metadata": {}, "outputs": [ @@ -457,210 +589,50 @@ " none\n", " none\n", " 41310\n", - " 0.0069\n", + " 0.006900\n", " \n", " \n", " 1\n", - " l0_sigmoid\n", - " 1.0\n", - " 41310\n", - " 0.0069\n", + " random_sampling_minimization\n", + " 0.5\n", + " 20655\n", + " 80.882353\n", " \n", " \n", " 2\n", - " l0_sigmoid\n", - " 0.1\n", - " 41310\n", - " 39.2959\n", + " random_sampling_minimization\n", + " 0.6\n", + " 24786\n", + " 80.882353\n", " \n", " \n", " 3\n", - " l0_sigmoid\n", - " 0.01\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 4\n", - " l0_sigmoid\n", - " 0.001\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 5\n", - " l0_sigmoid\n", - " 0.0001\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 6\n", - " l0_sigmoid\n", - " 0.00001\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 7\n", - " l0_log\n", - " 1.0\n", - " 41310\n", - " 0.0069\n", - " \n", - " \n", - " 8\n", - " l0_log\n", - " 0.1\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 9\n", - " l0_log\n", - " 0.01\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 10\n", - " l0_log\n", - " 0.001\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 11\n", - " l0_log\n", - " 0.0001\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 12\n", - " l0_log\n", - " 0.00001\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 13\n", - " l0_exp\n", - " 1.0\n", - " 41310\n", - " 0.0069\n", - " \n", - " \n", - " 14\n", - " l0_exp\n", - " 0.1\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 15\n", - " l0_exp\n", - " 0.01\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 16\n", - " l0_exp\n", - " 0.001\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 17\n", - " l0_exp\n", - " 0.0001\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 18\n", - " l0_exp\n", - " 0.00001\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 19\n", - " l1\n", - " 1.0\n", - " 41310\n", - " 0.0069\n", - " \n", - " \n", - " 20\n", - " l1\n", - " 0.1\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 21\n", - " l1\n", - " 0.01\n", - " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 22\n", - " l1\n", 
+ " candidate_loss_contribution\n", " 0.001\n", " 41310\n", - " 39.2959\n", + " 80.882353\n", " \n", " \n", - " 23\n", - " l1\n", + " 4\n", + " candidate_loss_contribution\n", " 0.0001\n", " 41310\n", - " 39.2959\n", - " \n", - " \n", - " 24\n", - " l1\n", - " 0.00001\n", - " 41310\n", - " 39.2959\n", + " 80.882353\n", " \n", " \n", "\n", "" ], "text/plain": [ - " strategy parameter dataset_size total_loss\n", - "0 none none 41310 0.0069\n", - "1 l0_sigmoid 1.0 41310 0.0069\n", - "2 l0_sigmoid 0.1 41310 39.2959\n", - "3 l0_sigmoid 0.01 41310 39.2959\n", - "4 l0_sigmoid 0.001 41310 39.2959\n", - "5 l0_sigmoid 0.0001 41310 39.2959\n", - "6 l0_sigmoid 0.00001 41310 39.2959\n", - "7 l0_log 1.0 41310 0.0069\n", - "8 l0_log 0.1 41310 39.2959\n", - "9 l0_log 0.01 41310 39.2959\n", - "10 l0_log 0.001 41310 39.2959\n", - "11 l0_log 0.0001 41310 39.2959\n", - "12 l0_log 0.00001 41310 39.2959\n", - "13 l0_exp 1.0 41310 0.0069\n", - "14 l0_exp 0.1 41310 39.2959\n", - "15 l0_exp 0.01 41310 39.2959\n", - "16 l0_exp 0.001 41310 39.2959\n", - "17 l0_exp 0.0001 41310 39.2959\n", - "18 l0_exp 0.00001 41310 39.2959\n", - "19 l1 1.0 41310 0.0069\n", - "20 l1 0.1 41310 39.2959\n", - "21 l1 0.01 41310 39.2959\n", - "22 l1 0.001 41310 39.2959\n", - "23 l1 0.0001 41310 39.2959\n", - "24 l1 0.00001 41310 39.2959" + " strategy parameter dataset_size total_loss\n", + "0 none none 41310 0.006900\n", + "1 random_sampling_minimization 0.5 20655 80.882353\n", + "2 random_sampling_minimization 0.6 24786 80.882353\n", + "3 candidate_loss_contribution 0.001 41310 80.882353\n", + "4 candidate_loss_contribution 0.0001 41310 80.882353" ] }, - "execution_count": 63, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -669,7 +641,7 @@ "\"\"\"\n", "Scraping values from created calibration_log.csv and .h5 files to populate the plotting dataframe\n", "\"\"\"\n", - "\n", + "'''\n", "approaches = [\"l0_sigmoid\", \"l0_log\", \"l0_exp\", \"l1\"]\n", "penalty_weights = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]\n", "og_size = 41310 # Original size of the dataset\n", @@ -693,12 +665,11 @@ " loss_value = loss_score(calibration_log)\n", " \n", " reg_results_df = add_result(reg_results_df, strategy, parameter, dataset_size, loss_value)\n", - "\n", - "\n", + "'''\n", "\n", "approaches = {\n", - " \"random_sampling_minimization\":[0.5, 0.6, 0.7, 0.8, 0.9], \n", - " \"candidate_loss_contribution\": [0.001, 0.0001, 0.00001, 0.000001, 0.0000001],\n", + " \"random_sampling_minimization\":[0.5, 0.6], #, 0.7, 0.8, 0.9], \n", + " \"candidate_loss_contribution\": [0.001, 0.0001] #, 0.00001, 0.000001, 0.0000001],\n", "}\n", "\n", "for approach, fractions in approaches.items(): # Use .items() to get key-value pairs\n", @@ -707,14 +678,13 @@ " parameter = fraction\n", "\n", " # Pull length of .h5 file\n", - " h5_name = f\"{fraction}_enhanced_cps_2024_{approach}_minimised.h5\"\n", + " h5_name = f\"{fraction}_enhanced_cps_2024_minimised.h5\"\n", " h5_path = STORAGE_FOLDER / strategy / h5_name\n", " dataset_size = len(h5py.File(h5_path, \"r\")['household_weight/2024'])\n", "\n", " # Pull sum of loss column\n", - " cal_log_name = f\"{fraction}_enhanced_cps_2024_{approach}_minimised_calibration_log.csv\"\n", - " cal_log_path = STORAGE_FOLDER / strategy / cal_log_name\n", - " cal_log_path = get_output_path(strategy, cal_log_name)\n", + " cal_log_name = f\"{fraction}_enhanced_cps_2024_minimised_calibration_log.csv\"\n", + " cal_log_name = get_output_path(strategy, cal_log_name)\n", " calibration_log = pd.read_csv(cal_log_path)\n", " loss_value 
= loss_score(calibration_log)\n", "\n", @@ -731,96 +701,6 @@ "### Plotting" ] }, - { - "cell_type": "code", - "execution_count": 2, - "id": "9602953a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " strategy parameter dataset_size total_loss\n", - "0 none none 41310 0.0069\n", - "1 l0_sigmoid 1.0 41310 0.0069\n", - "2 l0_sigmoid 0.1 37048 5.8041\n", - "3 l0_sigmoid 0.01 32786 13.3255\n", - "4 l0_sigmoid 0.001 28524 21.6723\n", - "5 l0_sigmoid 0.0001 24262 30.6049\n", - "6 l0_sigmoid 0.00001 20000 40.0000\n", - "7 l0_log 1.0 41310 0.0069\n", - "8 l0_log 0.1 37048 8.7028\n", - "9 l0_log 0.01 32786 19.9847\n", - "10 l0_log 0.001 28524 32.5050\n", - "11 l0_log 0.0001 24262 45.9039\n", - "12 l0_log 0.00001 20000 59.9965\n", - "13 l0_exp 1.0 41310 0.0069\n", - "14 l0_exp 0.1 37048 11.6014\n", - "15 l0_exp 0.01 32786 26.6440\n", - "16 l0_exp 0.001 28524 43.3377\n", - "17 l0_exp 0.0001 24262 61.2029\n", - "18 l0_exp 0.00001 20000 79.9931\n", - "19 l1 1.0 41310 0.0069\n", - "20 l1 0.1 37048 14.5000\n", - "21 l1 0.01 32786 33.3033\n", - "22 l1 0.001 28524 54.1704\n", - "23 l1 0.0001 24262 76.5019\n", - "24 l1 0.00001 20000 99.9896\n" - ] - } - ], - "source": [ - "'''\n", - "Synthetic dataset\n", - "'''\n", - "\n", - "# Define values\n", - "strategies = ['l0_sigmoid', 'l0_log', 'l0_exp', 'l1']\n", - "parameters = [1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001]\n", - "\n", - "# Synthetic values\n", - "base_size = 41310\n", - "min_size = 20000\n", - "base_loss = 0.0069\n", - "max_loss = 40.0\n", - "\n", - "strategy_slopes = {\n", - " 'l0_sigmoid': 1.0,\n", - " 'l0_log': 1.5,\n", - " 'l0_exp': 2.0,\n", - " 'l1': 2.5,\n", - "}\n", - "\n", - "rows = [{'strategy': 'none', 'parameter': 'none', 'dataset_size': base_size, 'total_loss': base_loss}]\n", - "\n", - "for strategy in strategies:\n", - " slope = strategy_slopes[strategy]\n", - " \n", - " for i, param in enumerate(parameters):\n", - " # Normalized compression level: 0 (no compression) to 1 (max compression)\n", - " compression_level = i / (len(parameters) - 1)\n", - " \n", - " # Size shrinks linearly\n", - " size = int(base_size - (base_size - min_size) * compression_level)\n", - " \n", - " # Loss increases quadratically (or linearly) based on strategy slope\n", - " loss = round(base_loss + slope * (max_loss - base_loss) * (compression_level ** 1.2), 4)\n", - " \n", - " rows.append({\n", - " 'strategy': strategy,\n", - " 'parameter': param,\n", - " 'dataset_size': size,\n", - " 'total_loss': loss\n", - " })\n", - "\n", - "# Create DataFrame\n", - "reg_results_df = pd.DataFrame(rows)\n", - "\n", - "# Display\n", - "print(reg_results_df)" - ] - }, { "cell_type": "code", "execution_count": null, From 791f0d964a49ef8a52545c82641f8c4ccccbca71 Mon Sep 17 00:00:00 2001 From: eccuraa Date: Wed, 16 Jul 2025 10:50:05 -0400 Subject: [PATCH 57/58] fixed a scraping bug --- test_minimization_approach.ipynb | 277 ++++++------------------------- 1 file changed, 52 insertions(+), 225 deletions(-) diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index e9f8eb69..4fcb8b91 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -18,27 +18,15 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 16, "id": "d6dc9cca", "metadata": {}, - "outputs": [ - { - "ename": "ImportError", - "evalue": "cannot import name 'prune_dataset' from 'policyengine_us_data.datasets.cps.enhanced_cps' 
(/Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/datasets/cps/enhanced_cps.py)", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[12], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpolicyengine_us_data\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mstorage\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m STORAGE_FOLDER\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpolicyengine_us\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Microsimulation\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpolicyengine_us_data\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatasets\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcps\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01menhanced_cps\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m reweight, prune_dataset, ExtendedCPS_2024\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpolicyengine_us_data\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m build_loss_matrix\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mnumpy\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mnp\u001b[39;00m\n", - "\u001b[0;31mImportError\u001b[0m: cannot import name 'prune_dataset' from 'policyengine_us_data.datasets.cps.enhanced_cps' (/Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/datasets/cps/enhanced_cps.py)" - ] - } - ], + "outputs": [], "source": [ "from policyengine_us_data.utils.minimize import minimize_dataset, random_sampling_minimization, candidate_loss_contribution\n", "from policyengine_us_data.storage import STORAGE_FOLDER\n", "from policyengine_us import Microsimulation\n", - "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, prune_dataset, ExtendedCPS_2024\n", + "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", "from policyengine_us_data.utils import build_loss_matrix\n", "import numpy as np\n", "import os\n", @@ -248,136 +236,44 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 17, "id": "aeab67b3", "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250/250 [01:59<00:00, 2.08it/s, loss=3.47e-5, loss_rel_change=-0.347]\n", - "100%|██████████| 250/250 [01:43<00:00, 2.41it/s, loss=3.27e-5, loss_rel_change=-0.407]\n", - "100%|██████████| 250/250 [02:00<00:00, 2.08it/s, loss=3.22e-5, loss_rel_change=-0.368]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Weight relative change: 99.95%\n", - "Re-calibrating final selected households...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250/250 [01:32<00:00, 2.70it/s, loss=3.35e-5, loss_rel_change=-0.359]\n" - ] - }, - { - "name": "stdout", - 
"output_type": "stream", - "text": [ - "Final calibration completed successfully\n", - "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/random_sampling_minimization/0.5_enhanced_cps_2024_minimised.h5\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250/250 [01:45<00:00, 2.38it/s, loss=3.46e-5, loss_rel_change=-0.318]\n", - "100%|██████████| 250/250 [01:42<00:00, 2.44it/s, loss=3.11e-5, loss_rel_change=-0.395]\n", - "100%|██████████| 250/250 [01:46<00:00, 2.35it/s, loss=3.08e-5, loss_rel_change=-0.405]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Weight relative change: 99.99%\n", - "Re-calibrating final selected households...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250/250 [02:18<00:00, 1.80it/s, loss=3.14e-5, loss_rel_change=-0.385]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Final calibration completed successfully\n", - "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/random_sampling_minimization/0.6_enhanced_cps_2024_minimised.h5\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250/250 [02:36<00:00, 1.60it/s, loss=3.29e-5, loss_rel_change=-0.343]\n", - "100%|██████████| 250/250 [3:02:18<00:00, 43.76s/it, loss=3.43e-5, loss_rel_change=-0.578] \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Iteration 1: Loss changed from 3.668773852244141e-08 to 3.9001762470775345e-08, which is too high (6.31%). Stopping.\n", - "Weight relative change: 100.00%\n", - "Re-calibrating final selected households...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250/250 [02:07<00:00, 1.95it/s, loss=3.23e-5, loss_rel_change=-0.364]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Final calibration completed successfully\n", - "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/0.001_enhanced_cps_2024_minimised.h5\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250/250 [02:10<00:00, 1.92it/s, loss=3.19e-5, loss_rel_change=-0.372]\n", - "100%|██████████| 250/250 [02:07<00:00, 1.96it/s, loss=3.58e-5, loss_rel_change=-0.556]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Iteration 1: Loss changed from 3.708600229852418e-08 to 3.936675423208132e-08, which is too high (6.15%). 
Stopping.\n", - "Weight relative change: 100.00%\n", - "Re-calibrating final selected households...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250/250 [02:20<00:00, 1.78it/s, loss=3.22e-5, loss_rel_change=-0.38] \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Final calibration completed successfully\n", - "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/0.0001_enhanced_cps_2024_minimised.h5\n" + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[17], line 31\u001b[0m\n\u001b[1;32m 29\u001b[0m output_path \u001b[38;5;241m=\u001b[39m STORAGE_FOLDER \u001b[38;5;241m/\u001b[39m approach \u001b[38;5;241m/\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvalue\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_enhanced_cps_2024_minimised.h5\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 30\u001b[0m output_path\u001b[38;5;241m.\u001b[39mparent\u001b[38;5;241m.\u001b[39mmkdir(parents\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m---> 31\u001b[0m \u001b[43mminimize_dataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 32\u001b[0m \u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 33\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 34\u001b[0m \u001b[43m \u001b[49m\u001b[43mminimization_function\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mminimization_function\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 35\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget_fractions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 36\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m params \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mloss_rel_change_max\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 38\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m files:\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/utils/minimize.py:336\u001b[0m, in \u001b[0;36mminimize_dataset\u001b[0;34m(dataset, output_path, minimization_function, loss_matrix, targets, **kwargs)\u001b[0m\n\u001b[1;32m 333\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 334\u001b[0m dataset_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(dataset)\n\u001b[0;32m--> 336\u001b[0m \u001b[43mcreate_calibration_log_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 338\u001b[0m dataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_file(dataset_path)\n\u001b[1;32m 339\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m loss_matrix \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m targets \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", + "File 
\u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/utils/minimize.py:26\u001b[0m, in \u001b[0;36mcreate_calibration_log_file\u001b[0;34m(file_path, epoch)\u001b[0m\n\u001b[1;32m 23\u001b[0m dataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_file(file_path)\n\u001b[1;32m 24\u001b[0m sim \u001b[38;5;241m=\u001b[39m Microsimulation(dataset\u001b[38;5;241m=\u001b[39mdataset)\n\u001b[0;32m---> 26\u001b[0m loss_matrix, targets \u001b[38;5;241m=\u001b[39m \u001b[43mbuild_loss_matrix\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m2024\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 28\u001b[0m bad_mask \u001b[38;5;241m=\u001b[39m loss_matrix\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39misin(bad_targets)\n\u001b[1;32m 29\u001b[0m keep_mask_bool \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m~\u001b[39mbad_mask\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/utils/loss.py:243\u001b[0m, in \u001b[0;36mbuild_loss_matrix\u001b[0;34m(dataset, time_period)\u001b[0m\n\u001b[1;32m 241\u001b[0m \u001b[38;5;66;03m# National ACA Spending\u001b[39;00m\n\u001b[1;32m 242\u001b[0m label \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnation/gov/aca_spending\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m--> 243\u001b[0m loss_matrix[label] \u001b[38;5;241m=\u001b[39m \u001b[43msim\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 244\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43maca_ptc\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mhousehold\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m2025\u001b[39;49m\n\u001b[1;32m 245\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mvalues\n\u001b[1;32m 246\u001b[0m ACA_SPENDING_2024 \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m9.8e10\u001b[39m \u001b[38;5;66;03m# 2024 outlays on PTC\u001b[39;00m\n\u001b[1;32m 247\u001b[0m targets_array\u001b[38;5;241m.\u001b[39mappend(ACA_SPENDING_2024)\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/microsimulation.py:54\u001b[0m, in \u001b[0;36mMicrosimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, use_weights, decode_enums)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m period \u001b[38;5;241m=\u001b[39m get_period(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period)\n\u001b[0;32m---> 54\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_enums\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m use_weights:\n\u001b[1;32m 56\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m values\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:485\u001b[0m, in \u001b[0;36mSimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, decode_enums)\u001b[0m\n\u001b[1;32m 482\u001b[0m np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mseed(\u001b[38;5;28mhash\u001b[39m(variable_name \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(period)) \u001b[38;5;241m%\u001b[39m \u001b[38;5;241m1000000\u001b[39m)\n\u001b[1;32m 484\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 485\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_calculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 486\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, EnumArray) \u001b[38;5;129;01mand\u001b[39;00m decode_enums:\n\u001b[1;32m 487\u001b[0m result \u001b[38;5;241m=\u001b[39m result\u001b[38;5;241m.\u001b[39mdecode_to_str()\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:715\u001b[0m, in \u001b[0;36mSimulation._calculate\u001b[0;34m(self, variable_name, period)\u001b[0m\n\u001b[1;32m 713\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 714\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_for_cycle(variable\u001b[38;5;241m.\u001b[39mname, period)\n\u001b[0;32m--> 715\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_formula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 717\u001b[0m \u001b[38;5;66;03m# If no result, use the default value and cache it\u001b[39;00m\n\u001b[1;32m 718\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m array \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 719\u001b[0m \u001b[38;5;66;03m# Check if the variable has a previously defined value\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:1005\u001b[0m, in \u001b[0;36mSimulation._run_formula\u001b[0;34m(self, variable, population, period)\u001b[0m\n\u001b[1;32m 1003\u001b[0m array \u001b[38;5;241m=\u001b[39m formula(population, period)\n\u001b[1;32m 1004\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1005\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[43mformula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparameters_at\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1007\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m array\n", 
+ "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/variables/gov/aca/ptc/aca_ptc.py:14\u001b[0m, in \u001b[0;36maca_ptc.formula\u001b[0;34m(tax_unit, period, parameters)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mformula\u001b[39m(tax_unit, period, parameters):\n\u001b[0;32m---> 14\u001b[0m plan_cost \u001b[38;5;241m=\u001b[39m \u001b[43mtax_unit\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mslcsp\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 15\u001b[0m income \u001b[38;5;241m=\u001b[39m tax_unit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maca_magi\u001b[39m\u001b[38;5;124m\"\u001b[39m, period)\n\u001b[1;32m 16\u001b[0m applicable_figure \u001b[38;5;241m=\u001b[39m tax_unit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maca_ptc_phase_out_rate\u001b[39m\u001b[38;5;124m\"\u001b[39m, period)\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/populations/group_population.py:38\u001b[0m, in \u001b[0;36mGroupPopulation.__call__\u001b[0;34m(self, variable_name, period, options)\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msum(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmembers(variable_name, period, options))\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 38\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__call__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/populations/population.py:137\u001b[0m, in \u001b[0;36mPopulation.__call__\u001b[0;34m(self, variable_name, period, options)\u001b[0m\n\u001b[1;32m 133\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msimulation\u001b[38;5;241m.\u001b[39mcalculate_divide(\n\u001b[1;32m 134\u001b[0m variable_name, period, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mcalculate_kwargs\n\u001b[1;32m 135\u001b[0m )\n\u001b[1;32m 136\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 137\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msimulation\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 138\u001b[0m \u001b[43m \u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcalculate_kwargs\u001b[49m\n\u001b[1;32m 139\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/microsimulation.py:54\u001b[0m, in \u001b[0;36mMicrosimulation.calculate\u001b[0;34m(self, variable_name, 
period, map_to, use_weights, decode_enums)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m period \u001b[38;5;241m=\u001b[39m get_period(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period)\n\u001b[0;32m---> 54\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_enums\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m use_weights:\n\u001b[1;32m 56\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m values\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:485\u001b[0m, in \u001b[0;36mSimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, decode_enums)\u001b[0m\n\u001b[1;32m 482\u001b[0m np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mseed(\u001b[38;5;28mhash\u001b[39m(variable_name \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(period)) \u001b[38;5;241m%\u001b[39m \u001b[38;5;241m1000000\u001b[39m)\n\u001b[1;32m 484\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 485\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_calculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 486\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, EnumArray) \u001b[38;5;129;01mand\u001b[39;00m decode_enums:\n\u001b[1;32m 487\u001b[0m result \u001b[38;5;241m=\u001b[39m result\u001b[38;5;241m.\u001b[39mdecode_to_str()\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:681\u001b[0m, in \u001b[0;36mSimulation._calculate\u001b[0;34m(self, variable_name, period)\u001b[0m\n\u001b[1;32m 679\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_calculate(variable_name, contained_months[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m])\n\u001b[1;32m 680\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 681\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate_add\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 682\u001b[0m alternate_period_handling \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 683\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m variable\u001b[38;5;241m.\u001b[39mdefinition_period \u001b[38;5;241m==\u001b[39m YEAR 
\u001b[38;5;129;01mand\u001b[39;00m period\u001b[38;5;241m.\u001b[39munit \u001b[38;5;241m==\u001b[39m MONTH:\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/microsimulation.py:67\u001b[0m, in \u001b[0;36mMicrosimulation.calculate_add\u001b[0;34m(self, variable_name, period, map_to, use_weights)\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mcalculate_add\u001b[39m(\n\u001b[1;32m 61\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 62\u001b[0m variable_name: \u001b[38;5;28mstr\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 65\u001b[0m use_weights: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 66\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m MicroSeries:\n\u001b[0;32m---> 67\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate_add\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 68\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m use_weights:\n\u001b[1;32m 69\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m values\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:846\u001b[0m, in \u001b[0;36mSimulation.calculate_add\u001b[0;34m(self, variable_name, period, decode_enums)\u001b[0m\n\u001b[1;32m 835\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m variable\u001b[38;5;241m.\u001b[39mdefinition_period \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m [\n\u001b[1;32m 836\u001b[0m periods\u001b[38;5;241m.\u001b[39mDAY,\n\u001b[1;32m 837\u001b[0m periods\u001b[38;5;241m.\u001b[39mMONTH,\n\u001b[1;32m 838\u001b[0m periods\u001b[38;5;241m.\u001b[39mYEAR,\n\u001b[1;32m 839\u001b[0m ]:\n\u001b[1;32m 840\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 841\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnable to sum constant variable \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m over period \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m: only variables defined daily, monthly, or yearly can be summed over time.\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(\n\u001b[1;32m 842\u001b[0m variable\u001b[38;5;241m.\u001b[39mname, period\n\u001b[1;32m 843\u001b[0m )\n\u001b[1;32m 844\u001b[0m )\n\u001b[0;32m--> 846\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28msum\u001b[39m(\n\u001b[1;32m 847\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcalculate(variable_name, sub_period)\n\u001b[1;32m 848\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m sub_period \u001b[38;5;129;01min\u001b[39;00m period\u001b[38;5;241m.\u001b[39mget_subperiods(variable\u001b[38;5;241m.\u001b[39mdefinition_period)\n\u001b[1;32m 849\u001b[0m )\n\u001b[1;32m 850\u001b[0m holder \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_holder(variable\u001b[38;5;241m.\u001b[39mname)\n\u001b[1;32m 851\u001b[0m 
holder\u001b[38;5;241m.\u001b[39mput_in_cache(result, period, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbranch_name)\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:847\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 835\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m variable\u001b[38;5;241m.\u001b[39mdefinition_period \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m [\n\u001b[1;32m 836\u001b[0m periods\u001b[38;5;241m.\u001b[39mDAY,\n\u001b[1;32m 837\u001b[0m periods\u001b[38;5;241m.\u001b[39mMONTH,\n\u001b[1;32m 838\u001b[0m periods\u001b[38;5;241m.\u001b[39mYEAR,\n\u001b[1;32m 839\u001b[0m ]:\n\u001b[1;32m 840\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 841\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnable to sum constant variable \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m over period \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m: only variables defined daily, monthly, or yearly can be summed over time.\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(\n\u001b[1;32m 842\u001b[0m variable\u001b[38;5;241m.\u001b[39mname, period\n\u001b[1;32m 843\u001b[0m )\n\u001b[1;32m 844\u001b[0m )\n\u001b[1;32m 846\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28msum\u001b[39m(\n\u001b[0;32m--> 847\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msub_period\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 848\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m sub_period \u001b[38;5;129;01min\u001b[39;00m period\u001b[38;5;241m.\u001b[39mget_subperiods(variable\u001b[38;5;241m.\u001b[39mdefinition_period)\n\u001b[1;32m 849\u001b[0m )\n\u001b[1;32m 850\u001b[0m holder \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_holder(variable\u001b[38;5;241m.\u001b[39mname)\n\u001b[1;32m 851\u001b[0m holder\u001b[38;5;241m.\u001b[39mput_in_cache(result, period, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbranch_name)\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/microsimulation.py:54\u001b[0m, in \u001b[0;36mMicrosimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, use_weights, decode_enums)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m period \u001b[38;5;241m=\u001b[39m get_period(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period)\n\u001b[0;32m---> 54\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mmap_to\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_enums\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m use_weights:\n\u001b[1;32m 56\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m values\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:485\u001b[0m, in \u001b[0;36mSimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, decode_enums)\u001b[0m\n\u001b[1;32m 482\u001b[0m np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mseed(\u001b[38;5;28mhash\u001b[39m(variable_name \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(period)) \u001b[38;5;241m%\u001b[39m \u001b[38;5;241m1000000\u001b[39m)\n\u001b[1;32m 484\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 485\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_calculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 486\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, EnumArray) \u001b[38;5;129;01mand\u001b[39;00m decode_enums:\n\u001b[1;32m 487\u001b[0m result \u001b[38;5;241m=\u001b[39m result\u001b[38;5;241m.\u001b[39mdecode_to_str()\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:715\u001b[0m, in \u001b[0;36mSimulation._calculate\u001b[0;34m(self, variable_name, period)\u001b[0m\n\u001b[1;32m 713\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 714\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_for_cycle(variable\u001b[38;5;241m.\u001b[39mname, period)\n\u001b[0;32m--> 715\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_formula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 717\u001b[0m \u001b[38;5;66;03m# If no result, use the default value and cache it\u001b[39;00m\n\u001b[1;32m 718\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m array \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 719\u001b[0m \u001b[38;5;66;03m# Check if the variable has a previously defined value\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:940\u001b[0m, in \u001b[0;36mSimulation._run_formula\u001b[0;34m(self, variable, population, period)\u001b[0m\n\u001b[1;32m 938\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m added_variable \u001b[38;5;129;01min\u001b[39;00m adds_list:\n\u001b[1;32m 939\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m added_variable \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtax_benefit_system\u001b[38;5;241m.\u001b[39mvariables:\n\u001b[0;32m--> 940\u001b[0m values \u001b[38;5;241m=\u001b[39m values \u001b[38;5;241m+\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 941\u001b[0m \u001b[43m 
\u001b[49m\u001b[43madded_variable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvariable\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mentity\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkey\u001b[49m\n\u001b[1;32m 942\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 943\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 944\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/microsimulation.py:54\u001b[0m, in \u001b[0;36mMicrosimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, use_weights, decode_enums)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m period \u001b[38;5;241m=\u001b[39m get_period(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period)\n\u001b[0;32m---> 54\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_enums\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m use_weights:\n\u001b[1;32m 56\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m values\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:485\u001b[0m, in \u001b[0;36mSimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, decode_enums)\u001b[0m\n\u001b[1;32m 482\u001b[0m np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mseed(\u001b[38;5;28mhash\u001b[39m(variable_name \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(period)) \u001b[38;5;241m%\u001b[39m \u001b[38;5;241m1000000\u001b[39m)\n\u001b[1;32m 484\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 485\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_calculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 486\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, EnumArray) \u001b[38;5;129;01mand\u001b[39;00m decode_enums:\n\u001b[1;32m 487\u001b[0m result \u001b[38;5;241m=\u001b[39m result\u001b[38;5;241m.\u001b[39mdecode_to_str()\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:715\u001b[0m, in \u001b[0;36mSimulation._calculate\u001b[0;34m(self, variable_name, period)\u001b[0m\n\u001b[1;32m 713\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 
714\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_for_cycle(variable\u001b[38;5;241m.\u001b[39mname, period)\n\u001b[0;32m--> 715\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_formula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 717\u001b[0m \u001b[38;5;66;03m# If no result, use the default value and cache it\u001b[39;00m\n\u001b[1;32m 718\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m array \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 719\u001b[0m \u001b[38;5;66;03m# Check if the variable has a previously defined value\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:1005\u001b[0m, in \u001b[0;36mSimulation._run_formula\u001b[0;34m(self, variable, population, period)\u001b[0m\n\u001b[1;32m 1003\u001b[0m array \u001b[38;5;241m=\u001b[39m formula(population, period)\n\u001b[1;32m 1004\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1005\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[43mformula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparameters_at\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1007\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m array\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/variables/gov/aca/slspc/slcsp_age_curve_amount_person.py:27\u001b[0m, in \u001b[0;36mslcsp_age_curve_amount_person.formula\u001b[0;34m(person, period, parameters)\u001b[0m\n\u001b[1;32m 19\u001b[0m p \u001b[38;5;241m=\u001b[39m parameters(period)\u001b[38;5;241m.\u001b[39mgov\u001b[38;5;241m.\u001b[39maca\u001b[38;5;241m.\u001b[39mage_curves\n\u001b[1;32m 21\u001b[0m \u001b[38;5;66;03m# Handle other states with regular bracket structures\u001b[39;00m\n\u001b[1;32m 22\u001b[0m multiplier \u001b[38;5;241m=\u001b[39m select(\n\u001b[1;32m 23\u001b[0m [\n\u001b[1;32m 24\u001b[0m state_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAL\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 25\u001b[0m state_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDC\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 26\u001b[0m state_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMA\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m---> 27\u001b[0m \u001b[43mstate_code\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mMN\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m,\n\u001b[1;32m 28\u001b[0m state_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMS\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 29\u001b[0m state_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOR\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 30\u001b[0m state_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUT\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 31\u001b[0m ],\n\u001b[1;32m 32\u001b[0m 
[\n\u001b[1;32m 33\u001b[0m p\u001b[38;5;241m.\u001b[39mal\u001b[38;5;241m.\u001b[39mcalc(age),\n\u001b[1;32m 34\u001b[0m p\u001b[38;5;241m.\u001b[39mdc\u001b[38;5;241m.\u001b[39mcalc(age),\n\u001b[1;32m 35\u001b[0m p\u001b[38;5;241m.\u001b[39mma\u001b[38;5;241m.\u001b[39mcalc(age),\n\u001b[1;32m 36\u001b[0m p\u001b[38;5;241m.\u001b[39mmn\u001b[38;5;241m.\u001b[39mcalc(age),\n\u001b[1;32m 37\u001b[0m p\u001b[38;5;241m.\u001b[39mms\u001b[38;5;241m.\u001b[39mcalc(age),\n\u001b[1;32m 38\u001b[0m p[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mor\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mcalc(age),\n\u001b[1;32m 39\u001b[0m p\u001b[38;5;241m.\u001b[39mut\u001b[38;5;241m.\u001b[39mcalc(age),\n\u001b[1;32m 40\u001b[0m ],\n\u001b[1;32m 41\u001b[0m default\u001b[38;5;241m=\u001b[39mp\u001b[38;5;241m.\u001b[39mdefault\u001b[38;5;241m.\u001b[39mcalc(age),\n\u001b[1;32m 42\u001b[0m )\n\u001b[1;32m 43\u001b[0m age_curve_applies \u001b[38;5;241m=\u001b[39m person\u001b[38;5;241m.\u001b[39mtax_unit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mslcsp_age_curve_applies\u001b[39m\u001b[38;5;124m\"\u001b[39m, period)\n\u001b[1;32m 44\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m base_cost \u001b[38;5;241m*\u001b[39m multiplier \u001b[38;5;241m*\u001b[39m age_curve_applies\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], @@ -395,10 +291,10 @@ "\n", "optional_params = {\n", " \"random_sampling_minimization\": {\n", - " \"target_fractions\": [0.5, 0.6]#, 0.7, 0.8, 0.9], # fractions of the dataset to keep\n", + " \"target_fractions\": [0.7, 0.8, 0.9]#, 0.5, 0.6]], # fractions of the dataset to keep\n", " },\n", " \"candidate_loss_contribution\": {\n", - " \"loss_rel_change_max\": [0.001, 0.0001]#, 0.00001, 0.000001, 0.0000001] # maximum relative change in loss\n", + " \"loss_rel_change_max\": [0.00001, 0.000001, 0.0000001]#, 0.001, 0.0001]] # maximum relative change in loss\n", " }\n", "}\n", "\n", @@ -449,7 +345,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 25, "id": "225debd8", "metadata": {}, "outputs": [ @@ -497,7 +393,7 @@ "0 none none 41310 0.0069" ] }, - "execution_count": 13, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -552,89 +448,20 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "7bb3ef3c", "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
" - ], - "text/plain": [ - " strategy parameter dataset_size total_loss\n", - "0 none none 41310 0.006900\n", - "1 random_sampling_minimization 0.5 20655 80.882353\n", - "2 random_sampling_minimization 0.6 24786 80.882353\n", - "3 candidate_loss_contribution 0.001 41310 80.882353\n", - "4 candidate_loss_contribution 0.0001 41310 80.882353" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" + "ename": "AttributeError", + "evalue": "module 'h5py' has no attribute 'Files'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[26], line 43\u001b[0m\n\u001b[1;32m 41\u001b[0m h5_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfraction\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_enhanced_cps_2024_minimised.h5\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 42\u001b[0m h5_path \u001b[38;5;241m=\u001b[39m STORAGE_FOLDER \u001b[38;5;241m/\u001b[39m strategy \u001b[38;5;241m/\u001b[39m h5_name\n\u001b[0;32m---> 43\u001b[0m dataset_size \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(\u001b[43mh5py\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mFiles\u001b[49m(h5_path, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m)[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mhousehold_weight/2024\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[1;32m 45\u001b[0m \u001b[38;5;66;03m# Pull sum of loss column\u001b[39;00m\n\u001b[1;32m 46\u001b[0m cal_log_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfraction\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_enhanced_cps_2024_minimised_calibration_log.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "\u001b[0;31mAttributeError\u001b[0m: module 'h5py' has no attribute 'Files'" + ] } ], "source": [ @@ -684,7 +511,7 @@ "\n", " # Pull sum of loss column\n", " cal_log_name = f\"{fraction}_enhanced_cps_2024_minimised_calibration_log.csv\"\n", - " cal_log_name = get_output_path(strategy, cal_log_name)\n", + " cal_log_path = get_output_path(strategy, cal_log_name)\n", " calibration_log = pd.read_csv(cal_log_path)\n", " loss_value = loss_score(calibration_log)\n", "\n", From 9520b16ce9777cfe736db893c0a35c92a19dce7a Mon Sep 17 00:00:00 2001 From: eccuraa Date: Wed, 16 Jul 2025 11:03:48 -0400 Subject: [PATCH 58/58] added pruning to L0, L1 approaches (and discovered candidate_loss approach is not being pruned yet either) --- .../datasets/cps/enhanced_cps.py | 16 +- test_minimization_approach.ipynb | 421 ++++++------------ 2 files changed, 143 insertions(+), 294 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 59abeafa..2fbb0293 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -39,7 +39,7 @@ def reweight( loss_matrix, targets_array, dropout_rate=0.05, - epochs=250, + epochs=150, log_path="calibration_log.csv", penalty_approach=None, penalty_weight=None, @@ -108,11 +108,21 @@ def loss( if penalty_approach == "l1": l1 = torch.mean(weights) return rel_error_normalized.mean() + penalty_weight * l1 - + return rel_error_normalized.mean() + penalty_weight * smoothed_l0 else: return rel_error_normalized.mean() + + 
def prune_dataset(weights, epsilon=1e-3): + """ + Prune dataset samples based on learned weights. + Returns indices of samples to keep. + """ + importance_scores = weights.detach().cpu().numpy() + keep_indices = np.where(importance_scores > epsilon)[0] + + return keep_indices def dropout_weights(weights, p): if p == 0: @@ -270,7 +280,7 @@ def generate(self): loss_matrix_clean, targets_array_clean, log_path="calibration_log.csv", - epochs=250, + epochs= 150, ) data["household_weight"][year] = optimised_weights diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 4fcb8b91..a4bd87be 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 34, "id": "d6dc9cca", "metadata": {}, "outputs": [], @@ -76,100 +76,15 @@ "execution_count": null, "id": "db975ac1", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 10/10 [00:03<00:00, 3.00it/s, loss=9.1e-5, loss_rel_change=-0.809] \n", - "100%|██████████| 10/10 [00:03<00:00, 2.96it/s, loss=0.000181, loss_rel_change=-0.679]\n", - "100%|██████████| 10/10 [00:03<00:00, 2.98it/s, loss=0.00108, loss_rel_change=-0.273]\n", - "100%|██████████| 10/10 [00:02<00:00, 3.59it/s, loss=0.0101, loss_rel_change=-0.0377]\n", - "100%|██████████| 10/10 [00:02<00:00, 3.46it/s, loss=0.1, loss_rel_change=-0.00391]\n", - "100%|██████████| 10/10 [00:02<00:00, 3.52it/s, loss=0.000191, loss_rel_change=-0.672]\n", - "100%|██████████| 10/10 [00:03<00:00, 2.89it/s, loss=0.00116, loss_rel_change=-0.274]\n", - "100%|██████████| 10/10 [00:02<00:00, 3.57it/s, loss=0.00978, loss_rel_change=-0.166]\n", - "100%|██████████| 10/10 [00:02<00:00, 3.34it/s, loss=0.0881, loss_rel_change=-0.22]\n", - "100%|██████████| 10/10 [00:02<00:00, 3.55it/s, loss=0.866, loss_rel_change=-0.23]\n", - "100%|██████████| 10/10 [00:03<00:00, 3.31it/s, loss=9.12e-5, loss_rel_change=-0.812]\n", - "100%|██████████| 10/10 [00:03<00:00, 3.26it/s, loss=0.00018, loss_rel_change=-0.687]\n", - "100%|██████████| 10/10 [00:02<00:00, 3.47it/s, loss=0.00108, loss_rel_change=-0.263]\n", - "100%|██████████| 10/10 [00:03<00:00, 3.21it/s, loss=0.0101, loss_rel_change=-0.0373]\n", - "100%|██████████| 10/10 [00:02<00:00, 3.37it/s, loss=0.1, loss_rel_change=-0.00383]\n", - "100%|██████████| 10/10 [00:03<00:00, 3.28it/s, loss=0.00389, loss_rel_change=-0.875]\n", - "100%|██████████| 10/10 [00:03<00:00, 3.17it/s, loss=0.0328, loss_rel_change=-0.894]\n", - "100%|██████████| 10/10 [00:03<00:00, 2.72it/s, loss=0.321, loss_rel_change=-0.896]\n", - "100%|██████████| 10/10 [00:02<00:00, 3.49it/s, loss=3.21, loss_rel_change=-0.896]\n", - "100%|██████████| 10/10 [00:02<00:00, 3.37it/s, loss=32.1, loss_rel_change=-0.896]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "=== CALIBRATION LOG DEBUG ===\n", - "File path: /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/enhanced_cps_2024.h5\n", - "Epoch: 0\n", - "Number of households: 41310\n", - "Total weight: 12764381616743.21\n", - "Weight range: 0.54 to 1303728.75\n", - "Loss matrix shape: (41310, 2813)\n", - "Number of targets: 2813\n", - "After filtering bad targets:\n", - "Loss matrix clean shape: (41310, 2805)\n", - "Number of clean targets: 2805\n", - "Estimates shape: (2805,)\n", - "Estimates sum: 324584770671300.88\n", - "First 3 estimates: nation/irs/adjusted gross income/total/AGI in -inf-inf/taxable/All 
1.498784e+13\n", - "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/All 1.609638e+10\n", - "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/All 6.707770e+10\n", - "dtype: float64\n", - "First 3 targets: [1.62972204e+13 1.68634879e+10 6.76819729e+10]\n", - "Mean absolute error: 17235490830.73\n", - "Mean relative error: 0.0997\n", - "=== END DEBUG ===\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250/250 [01:38<00:00, 2.54it/s, loss=3.62e-5, loss_rel_change=-0.301]\n", - "100%|██████████| 250/250 [01:35<00:00, 2.62it/s, loss=3.58e-5, loss_rel_change=-0.294]\n", - "100%|██████████| 250/250 [01:33<00:00, 2.68it/s, loss=3.34e-5, loss_rel_change=-0.376]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Weight relative change: 99.95%\n", - "Re-calibrating final selected households...\n" - ] - }, - { - "ename": "AttributeError", - "evalue": "'numpy.ndarray' object has no attribute 'columns'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[3], line 92\u001b[0m\n\u001b[1;32m 90\u001b[0m output_path \u001b[38;5;241m=\u001b[39m STORAGE_FOLDER \u001b[38;5;241m/\u001b[39m approach \u001b[38;5;241m/\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvalue\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_enhanced_cps_2024_minimised.h5\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 91\u001b[0m output_path\u001b[38;5;241m.\u001b[39mparent\u001b[38;5;241m.\u001b[39mmkdir(parents\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m---> 92\u001b[0m \u001b[43mminimise_dataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 93\u001b[0m \u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 94\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 95\u001b[0m \u001b[43m \u001b[49m\u001b[43mminimization_function\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mminimization_function\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 96\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget_fractions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 97\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 98\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m params \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mloss_rel_change_max\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 99\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m files:\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/utils/minimise.py:430\u001b[0m, in \u001b[0;36mminimise_dataset\u001b[0;34m(dataset, output_path, minimization_function, **kwargs)\u001b[0m\n\u001b[1;32m 428\u001b[0m \u001b[38;5;66;03m# Re-calibrate the final selected households to hit targets\u001b[39;00m\n\u001b[1;32m 429\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRe-calibrating final selected households...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 430\u001b[0m calibrated_weights \u001b[38;5;241m=\u001b[39m 
\u001b[43mreweight\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 431\u001b[0m \u001b[43m \u001b[49m\u001b[43minitial_weights\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 432\u001b[0m \u001b[43m \u001b[49m\u001b[43mloss_matrix_clean\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Convert to numpy array\u001b[39;49;00m\n\u001b[1;32m 433\u001b[0m \u001b[43m \u001b[49m\u001b[43mtargets_clean\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 434\u001b[0m \u001b[43m \u001b[49m\u001b[43mepochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m250\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Reduced epochs for faster processing\u001b[39;49;00m\n\u001b[1;32m 435\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 436\u001b[0m sim\u001b[38;5;241m.\u001b[39mset_input(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhousehold_weight\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;241m2024\u001b[39m, calibrated_weights)\n\u001b[1;32m 437\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFinal calibration completed successfully\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/datasets/cps/enhanced_cps.py:47\u001b[0m, in \u001b[0;36mreweight\u001b[0;34m(original_weights, loss_matrix, targets_array, dropout_rate, epochs, log_path, penalty_approach, penalty_weight)\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mreweight\u001b[39m(\n\u001b[1;32m 38\u001b[0m original_weights,\n\u001b[1;32m 39\u001b[0m loss_matrix,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 45\u001b[0m penalty_weight\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 46\u001b[0m ):\n\u001b[0;32m---> 47\u001b[0m target_names \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39marray(\u001b[43mloss_matrix\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m)\n\u001b[1;32m 48\u001b[0m is_national \u001b[38;5;241m=\u001b[39m loss_matrix\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mstr\u001b[38;5;241m.\u001b[39mstartswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnation/\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 49\u001b[0m loss_matrix \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mtensor(loss_matrix\u001b[38;5;241m.\u001b[39mvalues, dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat32)\n", - "\u001b[0;31mAttributeError\u001b[0m: 'numpy.ndarray' object has no attribute 'columns'" - ] - } - ], + "outputs": [], "source": [ "## ALL TESTS\n", "\n", "## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation\n", "input_dataset = ExtendedCPS_2024\n", "\n", - "approaches = [\"l0_sigmoid\", \"l0_log\", \"l0_exp\", \"l1\"]\n", - "penalty_weights = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]\n", + "approaches = [\"l0_sigmoid\"]#, \"l0_log\", \"l0_exp\", \"l1\"]\n", + "penalty_weights = [1e-5]#, 1e-4, 1e-3, 1e-2, 1e-1]\n", "\n", "def get_output_path(approach, file_name):\n", " output_path = STORAGE_FOLDER / approach / file_name\n", @@ -214,7 +129,7 @@ " penalty_weight=penalty_weight, \n", " epochs=250, # Reduced epochs for faster processing\n", " )\n", - " keep_indices = prune_dataset(optimised_weights, epsilon=1e-3, method=\"threshold\")\n", + " keep_indices = prune_dataset(optimised_weights, epsilon=1e-3)\n", " 
pruned_weights = optimised_weights[keep_indices]\n", " \n", " data[\"household_weight\"][year] = pruned_weights\n", @@ -345,7 +260,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 30, "id": "225debd8", "metadata": {}, "outputs": [ @@ -393,7 +308,7 @@ "0 none none 41310 0.0069" ] }, - "execution_count": 25, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -453,27 +368,113 @@ "metadata": {}, "outputs": [ { - "ename": "AttributeError", - "evalue": "module 'h5py' has no attribute 'Files'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[26], line 43\u001b[0m\n\u001b[1;32m 41\u001b[0m h5_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfraction\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_enhanced_cps_2024_minimised.h5\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 42\u001b[0m h5_path \u001b[38;5;241m=\u001b[39m STORAGE_FOLDER \u001b[38;5;241m/\u001b[39m strategy \u001b[38;5;241m/\u001b[39m h5_name\n\u001b[0;32m---> 43\u001b[0m dataset_size \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(\u001b[43mh5py\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mFiles\u001b[49m(h5_path, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m)[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mhousehold_weight/2024\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[1;32m 45\u001b[0m \u001b[38;5;66;03m# Pull sum of loss column\u001b[39;00m\n\u001b[1;32m 46\u001b[0m cal_log_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfraction\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_enhanced_cps_2024_minimised_calibration_log.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", - "\u001b[0;31mAttributeError\u001b[0m: module 'h5py' has no attribute 'Files'" - ] + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
" + ], + "text/plain": [ + " strategy parameter dataset_size total_loss\n", + "0 none none 41310 0.006900\n", + "1 candidate_loss_contribution 1.0 41310 0.006900\n", + "2 random_sampling_minimization 0.5 20655 80.882353\n", + "3 random_sampling_minimization 0.6 24786 79.117647\n", + "4 random_sampling_minimization 1.0 41310 0.006900\n", + "5 candidate_loss_contribution 0.001 41310 77.647059\n", + "6 candidate_loss_contribution 0.0001 41310 80.196078" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "\"\"\"\n", "Scraping values from created calibration_log.csv and .h5 files to populate the plotting dataframe\n", "\"\"\"\n", - "'''\n", - "approaches = [\"l0_sigmoid\", \"l0_log\", \"l0_exp\", \"l1\"]\n", - "penalty_weights = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]\n", + "\n", "og_size = 41310 # Original size of the dataset\n", "og_loss = 6.9e-3 # Original loss from the baseline dataset\n", "\n", + "approaches = [\"l0_sigmoid\", \"l0_log\", \"l0_exp\", \"l1\"]\n", + "penalty_weights = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]\n", + "\n", "for approach in approaches:\n", " strategy = approach\n", " reg_results_df = add_result(reg_results_df, strategy, 1.0, og_size, og_loss)\n", @@ -492,7 +493,6 @@ " loss_value = loss_score(calibration_log)\n", " \n", " reg_results_df = add_result(reg_results_df, strategy, parameter, dataset_size, loss_value)\n", - "'''\n", "\n", "approaches = {\n", " \"random_sampling_minimization\":[0.5, 0.6], #, 0.7, 0.8, 0.9], \n", @@ -500,6 +500,7 @@ "}\n", "\n", "for approach, fractions in approaches.items(): # Use .items() to get key-value pairs\n", + " reg_results_df = add_result(reg_results_df, strategy, 1.0, og_size, og_loss)\n", " for fraction in fractions:\n", " strategy = approach\n", " parameter = fraction\n", @@ -530,7 +531,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "id": "2dc0891c", "metadata": {}, "outputs": [ @@ -544,26 +545,17 @@ { "customdata": [ [ - "l0_exp" - ], - [ - "l0_exp" - ], - [ - "l0_exp" - ], - [ - "l0_exp" + "candidate_loss_contribution" ], [ - "l0_exp" + "candidate_loss_contribution" ], [ - "l0_exp" + "candidate_loss_contribution" ] ], "hovertemplate": "Strategy: %{customdata[0]}
Size: %{x}<br>Loss: %{y:.4f}<br>
Param: %{text}", - "legendgroup": "l0_exp", + "legendgroup": "candidate_loss_contribution", "line": { "color": "#636efa", "dash": "solid" @@ -572,224 +564,71 @@ "symbol": "circle" }, "mode": "lines+markers+text", - "name": "l0_exp", + "name": "candidate_loss_contribution", "orientation": "v", "showlegend": true, "text": [ "1.0", - "0.1", - "0.01", "0.001", - "0.0001", - "1e-05" + "0.0001" ], "textposition": "top center", "type": "scatter", "x": [ 41310, - 37048, - 32786, - 28524, - 24262, - 20000 - ], - "xaxis": "x", - "y": [ - 0.0069, - 11.6014, - 26.644, - 43.3377, - 61.2029, - 79.9931 - ], - "yaxis": "y" - }, - { - "customdata": [ - [ - "l0_log" - ], - [ - "l0_log" - ], - [ - "l0_log" - ], - [ - "l0_log" - ], - [ - "l0_log" - ], - [ - "l0_log" - ] - ], - "hovertemplate": "Strategy: %{customdata[0]}
Size: %{x}<br>Loss: %{y:.4f}<br>
Param: %{text}", - "legendgroup": "l0_log", - "line": { - "color": "#EF553B", - "dash": "solid" - }, - "marker": { - "symbol": "circle" - }, - "mode": "lines+markers+text", - "name": "l0_log", - "orientation": "v", - "showlegend": true, - "text": [ - "1.0", - "0.1", - "0.01", - "0.001", - "0.0001", - "1e-05" - ], - "textposition": "top center", - "type": "scatter", - "x": [ 41310, - 37048, - 32786, - 28524, - 24262, - 20000 + 41310 ], "xaxis": "x", "y": [ 0.0069, - 8.7028, - 19.9847, - 32.505, - 45.9039, - 59.9965 + 77.6470588235294, + 80.19607843137256 ], "yaxis": "y" }, { "customdata": [ [ - "l0_sigmoid" + "random_sampling_minimization" ], [ - "l0_sigmoid" + "random_sampling_minimization" ], [ - "l0_sigmoid" - ], - [ - "l0_sigmoid" - ], - [ - "l0_sigmoid" - ], - [ - "l0_sigmoid" + "random_sampling_minimization" ] ], "hovertemplate": "Strategy: %{customdata[0]}
Size: %{x}<br>Loss: %{y:.4f}<br>
Param: %{text}", - "legendgroup": "l0_sigmoid", + "legendgroup": "random_sampling_minimization", "line": { - "color": "#00cc96", - "dash": "solid" - }, - "marker": { - "symbol": "circle" - }, - "mode": "lines+markers+text", - "name": "l0_sigmoid", - "orientation": "v", - "showlegend": true, - "text": [ - "1.0", - "0.1", - "0.01", - "0.001", - "0.0001", - "1e-05" - ], - "textposition": "top center", - "type": "scatter", - "x": [ - 41310, - 37048, - 32786, - 28524, - 24262, - 20000 - ], - "xaxis": "x", - "y": [ - 0.0069, - 5.8041, - 13.3255, - 21.6723, - 30.6049, - 40 - ], - "yaxis": "y" - }, - { - "customdata": [ - [ - "l1" - ], - [ - "l1" - ], - [ - "l1" - ], - [ - "l1" - ], - [ - "l1" - ], - [ - "l1" - ] - ], - "hovertemplate": "Strategy: %{customdata[0]}
Size: %{x}<br>Loss: %{y:.4f}<br>
Param: %{text}", - "legendgroup": "l1", - "line": { - "color": "#ab63fa", + "color": "#EF553B", "dash": "solid" }, "marker": { "symbol": "circle" }, "mode": "lines+markers+text", - "name": "l1", + "name": "random_sampling_minimization", "orientation": "v", "showlegend": true, "text": [ "1.0", - "0.1", - "0.01", - "0.001", - "0.0001", - "1e-05" + "0.6", + "0.5" ], "textposition": "top center", "type": "scatter", "x": [ 41310, - 37048, - 32786, - 28524, - 24262, - 20000 + 24786, + 20655 ], "xaxis": "x", "y": [ 0.0069, - 14.5, - 33.3033, - 54.1704, - 76.5019, - 99.9896 + 79.11764705882354, + 80.88235294117646 ], "yaxis": "y" } @@ -828,7 +667,7 @@ "x0": 41310, "x1": 41310, "y0": 0.0069, - "y1": 99.9896 + "y1": 80.88235294117646 }, { "line": { @@ -837,7 +676,7 @@ }, "name": "Baseline Loss", "type": "line", - "x0": 20000, + "x0": 20655, "x1": 41310, "y0": 0.0069, "y1": 0.0069