From 4c0f1c611e85436fe5b1c0e1c87deb386846d761 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Thu, 10 Jul 2025 21:20:39 +0100 Subject: [PATCH 01/56] Shrink datasets --- policyengine_us_data/utils/minimise.py | 85 ++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 policyengine_us_data/utils/minimise.py diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py new file mode 100644 index 00000000..4355e889 --- /dev/null +++ b/policyengine_us_data/utils/minimise.py @@ -0,0 +1,85 @@ +from policyengine_us_data.utils.loss import build_loss_matrix +from policyengine_core.data import Dataset +from policyengine_us import Microsimulation +import numpy as np +import pandas as pd + +def minimise_dataset(dataset, output_path: str, loss_rel_change_max: float) -> None: + # if loading from a .h5 file, need to do dataset = Dataset.from_file(dataset) + loss_matrix = build_loss_matrix(dataset, 2024) + + sim = Microsimulation(dataset=dataset) + + weights = sim.calculate("household_weight", 2024).values + estimate_matrix, targets = loss_matrix + is_national = estimate_matrix.columns.str.startswith("nation/") + nation_normalisation_factor = is_national * (1 / is_national.sum()) + state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) + normalisation_factor = np.where( + is_national, nation_normalisation_factor, state_normalisation_factor + ) + weights @ estimate_matrix + + def get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_factor): + """ + Calculate the loss based on the inclusion mask and the estimate matrix. + """ + masked_weights = weights.copy() + original_weight_total = masked_weights.sum() + masked_weights[~inclusion_mask] = 0 + masked_weight_total = masked_weights.sum() + masked_weights[inclusion_mask] *= original_weight_total / masked_weight_total + estimates = masked_weights @ estimate_matrix + rel_error = ((estimates - targets) + 1) / (targets + 1) + loss = ((rel_error * normalisation_factor) ** 2).mean() + + return loss + + COUNT_ITERATIONS = 5 + FRACTION_REMOVE_PER_ITERATION = 0.1 + from tqdm import tqdm + + full_mask = np.ones_like(weights, dtype=bool) + for i in range(COUNT_ITERATIONS): + inclusion_mask = full_mask.copy() + baseline_loss = get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_factor) + household_loss_rel_changes = [] + for household_index in tqdm(range(len(weights))): + # Skip if this household is already excluded + if not inclusion_mask[household_index]: + household_loss_rel_changes.append(np.inf) + continue + # Calculate loss if this household is removed + inclusion_mask = inclusion_mask.copy() + inclusion_mask[household_index] = False + loss = get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_factor) + rel_change = (loss - baseline_loss) / baseline_loss + household_loss_rel_changes.append(rel_change) + inclusion_mask = full_mask.copy() + household_loss_rel_changes = np.array(household_loss_rel_changes) + # Sort by the relative change in loss + sorted_indices = np.argsort(household_loss_rel_changes) + # Remove the worst households + num_to_remove = int(len(weights) * FRACTION_REMOVE_PER_ITERATION) + worst_indices = sorted_indices[:num_to_remove] + inclusion_mask[worst_indices] = False + # Calculate the new loss + new_loss = get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_factor) + print(f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}") + print(f"Removed {num_to_remove} 
households with worst relative loss changes.") + # Update the full mask + full_mask &= inclusion_mask + + household_ids = sim.calculate("household_id", 2024).values + remaining_households = household_ids[full_mask] + + # At this point we have a mask of households to keep + + # I'm saving to a csv for ease of debugging, but we need to save to a .h5 file + + df = sim.to_input_dataframe() + df = df[df["household_id__2024"].isin(remaining_households)] + + df.to_csv(output_path, index=False) + + return df \ No newline at end of file From 6b2a56f6f8a55aacb4ee9e305bd53c74f36c70b0 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Thu, 10 Jul 2025 22:25:27 +0100 Subject: [PATCH 02/56] Move to package --- Makefile | 1 + .../storage/upload_completed_datasets.py | 1 + policyengine_us_data/utils/minimise.py | 127 +++++++++++++++--- 3 files changed, 114 insertions(+), 15 deletions(-) diff --git a/Makefile b/Makefile index 788ba1d3..90b2817a 100644 --- a/Makefile +++ b/Makefile @@ -46,6 +46,7 @@ data: python policyengine_us_data/datasets/cps/extended_cps.py python policyengine_us_data/datasets/cps/enhanced_cps.py python policyengine_us_data/datasets/cps/small_enhanced_cps.py + python policyengine_us_data/utils/minimise.py clean: rm -f policyengine_us_data/storage/*.h5 diff --git a/policyengine_us_data/storage/upload_completed_datasets.py b/policyengine_us_data/storage/upload_completed_datasets.py index f161a9ee..16885d8c 100644 --- a/policyengine_us_data/storage/upload_completed_datasets.py +++ b/policyengine_us_data/storage/upload_completed_datasets.py @@ -15,6 +15,7 @@ def upload_datasets(): Pooled_3_Year_CPS_2023.file_path, CPS_2023.file_path, STORAGE_FOLDER / "small_enhanced_cps_2024.h5", + STORAGE_FOLDER / "enhanced_cps_2024_minified.h5", ] for file_path in dataset_files: diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 4355e889..6fe511fd 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -3,9 +3,43 @@ from policyengine_us import Microsimulation import numpy as np import pandas as pd +import h5py +from policyengine_us_data.storage import STORAGE_FOLDER + + +def create_calibration_log_file(file_path): + dataset = Dataset.from_file(file_path) + + loss_matrix = build_loss_matrix(dataset, 2024) + + sim = Microsimulation(dataset=dataset) + + estimates = sim.calculate("household_weight", 2024).values @ loss_matrix[0] + target_names = loss_matrix[0].columns + target_values = loss_matrix[1] + + df = pd.DataFrame( + { + "target_name": target_names, + "estimate": estimates, + "target": target_values, + } + ) + df["epoch"] = 0 + df["error"] = df["estimate"] - df["target"] + df["rel_error"] = df["error"] / df["target"] + df["abs_error"] = df["error"].abs() + df["rel_abs_error"] = df["abs_error"] / df["target"].abs() + df["loss"] = (df["rel_error"] ** 2).mean() + + df.to_csv(file_path.replace(".h5", "_calibration_log.csv"), index=False) + + +def minimise_dataset( + dataset, output_path: str, loss_rel_change_max: float +) -> None: + create_calibration_log_file(dataset) -def minimise_dataset(dataset, output_path: str, loss_rel_change_max: float) -> None: - # if loading from a .h5 file, need to do dataset = Dataset.from_file(dataset) loss_matrix = build_loss_matrix(dataset, 2024) sim = Microsimulation(dataset=dataset) @@ -20,15 +54,20 @@ def minimise_dataset(dataset, output_path: str, loss_rel_change_max: float) -> N ) weights @ estimate_matrix - def get_loss_from_mask(inclusion_mask, estimate_matrix, targets, 
normalisation_factor): + def get_loss_from_mask( + inclusion_mask, estimate_matrix, targets, normalisation_factor + ): """ Calculate the loss based on the inclusion mask and the estimate matrix. """ masked_weights = weights.copy() original_weight_total = masked_weights.sum() - masked_weights[~inclusion_mask] = 0 + if (~inclusion_mask).sum() > 0: + masked_weights[~inclusion_mask] = 0 masked_weight_total = masked_weights.sum() - masked_weights[inclusion_mask] *= original_weight_total / masked_weight_total + masked_weights[inclusion_mask] *= ( + original_weight_total / masked_weight_total + ) estimates = masked_weights @ estimate_matrix rel_error = ((estimates - targets) + 1) / (targets + 1) loss = ((rel_error * normalisation_factor) ** 2).mean() @@ -36,15 +75,23 @@ def get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_f return loss COUNT_ITERATIONS = 5 + VIEW_FRACTION_PER_ITERATION = 0.3 FRACTION_REMOVE_PER_ITERATION = 0.1 from tqdm import tqdm full_mask = np.ones_like(weights, dtype=bool) for i in range(COUNT_ITERATIONS): inclusion_mask = full_mask.copy() - baseline_loss = get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_factor) + baseline_loss = get_loss_from_mask( + inclusion_mask, estimate_matrix, targets, normalisation_factor + ) household_loss_rel_changes = [] - for household_index in tqdm(range(len(weights))): + indices = np.random.choice( + np.arange(len(weights)), + size=int(len(weights) * VIEW_FRACTION_PER_ITERATION), + replace=False, + ) + for household_index in tqdm(indices): # Skip if this household is already excluded if not inclusion_mask[household_index]: household_loss_rel_changes.append(np.inf) @@ -52,7 +99,9 @@ def get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_f # Calculate loss if this household is removed inclusion_mask = inclusion_mask.copy() inclusion_mask[household_index] = False - loss = get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_factor) + loss = get_loss_from_mask( + inclusion_mask, estimate_matrix, targets, normalisation_factor + ) rel_change = (loss - baseline_loss) / baseline_loss household_loss_rel_changes.append(rel_change) inclusion_mask = full_mask.copy() @@ -64,12 +113,24 @@ def get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_f worst_indices = sorted_indices[:num_to_remove] inclusion_mask[worst_indices] = False # Calculate the new loss - new_loss = get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_factor) - print(f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}") - print(f"Removed {num_to_remove} households with worst relative loss changes.") + new_loss = get_loss_from_mask( + inclusion_mask, estimate_matrix, targets, normalisation_factor + ) + rel_change = (new_loss - baseline_loss) / baseline_loss + if rel_change > loss_rel_change_max: + print( + f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}, which is too high ({rel_change:.2%}). Stopping." + ) + break + print( + f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}" + ) + print( + f"Removed {num_to_remove} households with worst relative loss changes." 
+ ) # Update the full mask full_mask &= inclusion_mask - + household_ids = sim.calculate("household_id", 2024).values remaining_households = household_ids[full_mask] @@ -78,8 +139,44 @@ def get_loss_from_mask(inclusion_mask, estimate_matrix, targets, normalisation_f # I'm saving to a csv for ease of debugging, but we need to save to a .h5 file df = sim.to_input_dataframe() - df = df[df["household_id__2024"].isin(remaining_households)] + smaller_df = df[df["household_id__2024"].isin(remaining_households)] + + weight_rel_change = ( + smaller_df["household_weight__2024"].sum() + / df["household_weight__2024"].sum() + ) + print(f"Weight relative change: {weight_rel_change:.2%}") + + sim = Microsimulation(dataset=smaller_df) + + sim.set_input( + "household_weight", + 2024, + sim.calculate("household_weight", 2024).values / weight_rel_change, + ) + + data = {} + + for variable in sim.input_variables: + data[variable] = {2024: sim.calculate(variable, 2024).values} + if data[variable][2024].dtype == "object": + data[variable][2024] = data[variable][2024].astype("S") + + with h5py.File(output_path, "w") as f: + for variable, values in data.items(): + for year, value in values.items(): + f.create_dataset(f"{variable}/{year}", data=value) + print(f"Saved minimised dataset to {output_path}") + + create_calibration_log_file(output_path) + - df.to_csv(output_path, index=False) +if __name__ == "__main__": + # Example usage + files = [ + STORAGE_FOLDER / "enhanced_cps_2024.h5", + ] - return df \ No newline at end of file + for file in files: + output_path = file.with_name(file.stem + "_minimised.h5") + minimise_dataset(file, output_path, loss_rel_change_max=10) From 05ee7e4075293057756d24da0e23b36a6cfe3465 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Thu, 10 Jul 2025 23:50:52 +0100 Subject: [PATCH 03/56] Try L0 --- Makefile | 1 - .../datasets/cps/enhanced_cps.py | 18 +++++++++++++++++- policyengine_us_data/utils/minimise.py | 4 +++- 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 90b2817a..788ba1d3 100644 --- a/Makefile +++ b/Makefile @@ -46,7 +46,6 @@ data: python policyengine_us_data/datasets/cps/extended_cps.py python policyengine_us_data/datasets/cps/enhanced_cps.py python policyengine_us_data/datasets/cps/small_enhanced_cps.py - python policyengine_us_data/utils/minimise.py clean: rm -f policyengine_us_data/storage/*.h5 diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index b8af12ce..9e61414c 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -59,9 +59,25 @@ def loss(weights): ((estimate - targets_array) + 1) / (targets_array + 1) ) ** 2 rel_error_normalized = rel_error * normalisation_factor + + # L0 penalty (approximated with smooth function) + # Since L0 is non-differentiable, we use a smooth approximation + # Common approaches: + + # Option 1: Sigmoid approximation + epsilon = 1e-3 # Threshold for "near zero" + l0_penalty_weight = 1e-1 # Adjust this hyperparameter + smoothed_l0 = torch.sigmoid((weights - epsilon) / (epsilon * 0.1)).mean() + + # Option 2: Log-sum penalty (smoother) + # smoothed_l0 = torch.log(1 + actual_weights / epsilon).sum() / len(actual_weights) + + # Option 3: Exponential penalty + # smoothed_l0 = (1 - torch.exp(-actual_weights / epsilon)).mean() + if torch.isnan(rel_error_normalized).any(): raise ValueError("Relative error contains NaNs") - return rel_error_normalized.mean() + return 
rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 def dropout_weights(weights, p): if p == 0: diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 6fe511fd..2b122fec 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -32,14 +32,16 @@ def create_calibration_log_file(file_path): df["rel_abs_error"] = df["abs_error"] / df["target"].abs() df["loss"] = (df["rel_error"] ** 2).mean() - df.to_csv(file_path.replace(".h5", "_calibration_log.csv"), index=False) + df.to_csv(str(file_path).replace(".h5", "_calibration_log.csv"), index=False) def minimise_dataset( dataset, output_path: str, loss_rel_change_max: float ) -> None: + dataset = str(dataset) create_calibration_log_file(dataset) + dataset = Dataset.from_file(dataset) loss_matrix = build_loss_matrix(dataset, 2024) sim = Microsimulation(dataset=dataset) From e38c6479483c9b2fb0cca9939c881995267a10d7 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Thu, 10 Jul 2025 23:54:02 +0100 Subject: [PATCH 04/56] Format --- policyengine_us_data/datasets/cps/enhanced_cps.py | 10 ++++++---- policyengine_us_data/utils/minimise.py | 4 +++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 9e61414c..7d81a0c0 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -63,15 +63,17 @@ def loss(weights): # L0 penalty (approximated with smooth function) # Since L0 is non-differentiable, we use a smooth approximation # Common approaches: - + # Option 1: Sigmoid approximation epsilon = 1e-3 # Threshold for "near zero" l0_penalty_weight = 1e-1 # Adjust this hyperparameter - smoothed_l0 = torch.sigmoid((weights - epsilon) / (epsilon * 0.1)).mean() - + smoothed_l0 = torch.sigmoid( + (weights - epsilon) / (epsilon * 0.1) + ).mean() + # Option 2: Log-sum penalty (smoother) # smoothed_l0 = torch.log(1 + actual_weights / epsilon).sum() / len(actual_weights) - + # Option 3: Exponential penalty # smoothed_l0 = (1 - torch.exp(-actual_weights / epsilon)).mean() diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 2b122fec..186a7673 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -32,7 +32,9 @@ def create_calibration_log_file(file_path): df["rel_abs_error"] = df["abs_error"] / df["target"].abs() df["loss"] = (df["rel_error"] ** 2).mean() - df.to_csv(str(file_path).replace(".h5", "_calibration_log.csv"), index=False) + df.to_csv( + str(file_path).replace(".h5", "_calibration_log.csv"), index=False + ) def minimise_dataset( From bdf3d6d89d16ac396786899ce3e3233c0c46ceb4 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Fri, 11 Jul 2025 16:24:22 +0200 Subject: [PATCH 05/56] attempting to vectorize minimizing of ecps --- changelog_entry.yaml | 4 + .../datasets/cps/enhanced_cps.py | 27 +++--- policyengine_us_data/utils/minimise.py | 83 ++++++++++++++++--- 3 files changed, 91 insertions(+), 23 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..84eeb584 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: minor + changes: + added: + - Enhanced CPS minimizing tests. 
\ No newline at end of file diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 7d81a0c0..bf303f7a 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -45,8 +45,8 @@ def reweight( np.log(original_weights), requires_grad=True, dtype=torch.float32 ) - # TODO: replace this with a call to the python reweight.py package. - def loss(weights): + # TO DO: replace this with a call to the python reweight.py package. + def loss(weights, penalty_approach="l0_sigmoid"): # Check for Nans in either the weights or the loss matrix if torch.isnan(weights).any(): raise ValueError("Weights contain NaNs") @@ -60,25 +60,32 @@ def loss(weights): ) ** 2 rel_error_normalized = rel_error * normalisation_factor + if torch.isnan(rel_error_normalized).any(): + raise ValueError("Relative error contains NaNs") + # L0 penalty (approximated with smooth function) # Since L0 is non-differentiable, we use a smooth approximation # Common approaches: - # Option 1: Sigmoid approximation epsilon = 1e-3 # Threshold for "near zero" l0_penalty_weight = 1e-1 # Adjust this hyperparameter - smoothed_l0 = torch.sigmoid( - (weights - epsilon) / (epsilon * 0.1) - ).mean() + + # Option 1: Sigmoid approximation + if penalty_approach == "l0_sigmoid": + smoothed_l0 = torch.sigmoid( + (weights - epsilon) / (epsilon * 0.1) + ).mean() # Option 2: Log-sum penalty (smoother) - # smoothed_l0 = torch.log(1 + actual_weights / epsilon).sum() / len(actual_weights) + if penalty_approach == "l0_log": + smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len(weights) # Option 3: Exponential penalty - # smoothed_l0 = (1 - torch.exp(-actual_weights / epsilon)).mean() + if penalty_approach == "l0_exp": + smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() + + # L1 penalty - if torch.isnan(rel_error_normalized).any(): - raise ValueError("Relative error contains NaNs") return rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 def dropout_weights(weights, p): diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 186a7673..94601d02 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -5,6 +5,7 @@ import pandas as pd import h5py from policyengine_us_data.storage import STORAGE_FOLDER +from typing import Optional def create_calibration_log_file(file_path): @@ -37,6 +38,57 @@ def create_calibration_log_file(file_path): ) +def losses_for_candidates( + base_weights: np.ndarray, + idxs: np.ndarray, + est_mat: np.ndarray, + targets: np.ndarray, + norm: np.ndarray, + chunk_size: Optional[int] = 25_000, +) -> np.ndarray: + """ + Return the loss value *for each* candidate deletion in `idxs` + in one matrix multiplication. 
+ + Parameters + ---------- + base_weights : (n,) original weight vector + idxs : (k,) candidate row indices to zero-out + est_mat : (n, m) estimate matrix + targets : (m,) calibration targets + norm : (m,) normalisation factors + chunk_size : max number of candidates to process at once + + Returns + ------- + losses : (k,) loss if row i were removed (and weights rescaled) + """ + W = base_weights + total = W.sum() + k = len(idxs) + losses = np.empty(k, dtype=float) + + # Work through the candidate list in blocks + for start in range(0, k, chunk_size): + stop = min(start + chunk_size, k) + part = idxs[start:stop] # (p,) where p ≤ chunk_size + p = len(part) + + # Build the delta matrix only for this chunk + delta = np.zeros((p, len(W))) + delta[np.arange(p), part] = -W[part] + + keep_total = total + delta.sum(axis=1) # (p,) + delta *= (total / keep_total)[:, None] + + # Matrix–matrix multiply → one matrix multiplication per chunk + ests = (W + delta) @ est_mat # (p, m) + rel_err = ((ests - targets) + 1) / (targets + 1) + losses[start:stop] = ((rel_err * norm) ** 2).mean(axis=1) + + return losses + + def minimise_dataset( dataset, output_path: str, loss_rel_change_max: float ) -> None: @@ -95,19 +147,24 @@ def get_loss_from_mask( size=int(len(weights) * VIEW_FRACTION_PER_ITERATION), replace=False, ) - for household_index in tqdm(indices): - # Skip if this household is already excluded - if not inclusion_mask[household_index]: - household_loss_rel_changes.append(np.inf) - continue - # Calculate loss if this household is removed - inclusion_mask = inclusion_mask.copy() - inclusion_mask[household_index] = False - loss = get_loss_from_mask( - inclusion_mask, estimate_matrix, targets, normalisation_factor - ) - rel_change = (loss - baseline_loss) / baseline_loss - household_loss_rel_changes.append(rel_change) + + # more efficient approach to compute losses for candidate households to be removed + + # 1. sample only households that are currently *included* + indices = np.random.choice( + np.where(full_mask)[0], + size=int(full_mask.sum() * VIEW_FRACTION_PER_ITERATION), + replace=False, + ) + # 2. compute losses for the batch in one shot + candidate_losses = losses_for_candidates( + weights, indices, estimate_matrix, targets, normalisation_factor + ) + # 3. convert to relative change vs. 
baseline + household_loss_rel_changes = ( + candidate_losses - baseline_loss + ) / baseline_loss + inclusion_mask = full_mask.copy() household_loss_rel_changes = np.array(household_loss_rel_changes) # Sort by the relative change in loss From 03e5d0d380494b698cbcb4af14b5c8eb256754d0 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Fri, 11 Jul 2025 16:52:43 +0200 Subject: [PATCH 06/56] adding random sampling minimization strategy --- policyengine_us_data/utils/minimise.py | 240 ++++++++++++++++++------- 1 file changed, 173 insertions(+), 67 deletions(-) diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 94601d02..45212905 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -5,7 +5,7 @@ import pandas as pd import h5py from policyengine_us_data.storage import STORAGE_FOLDER -from typing import Optional +from typing import Optional, Callable def create_calibration_log_file(file_path): @@ -89,116 +89,214 @@ def losses_for_candidates( return losses -def minimise_dataset( - dataset, output_path: str, loss_rel_change_max: float -) -> None: - dataset = str(dataset) - create_calibration_log_file(dataset) +def get_loss_from_mask( + weights, inclusion_mask, estimate_matrix, targets, normalisation_factor +): + """ + Calculate the loss based on the inclusion mask and the estimate matrix. + """ + masked_weights = weights.copy() + original_weight_total = masked_weights.sum() + if (~inclusion_mask).sum() > 0: + masked_weights[~inclusion_mask] = 0 + masked_weight_total = masked_weights.sum() + masked_weights[inclusion_mask] *= ( + original_weight_total / masked_weight_total + ) + estimates = masked_weights @ estimate_matrix + rel_error = ((estimates - targets) + 1) / (targets + 1) + loss = ((rel_error * normalisation_factor) ** 2).mean() - dataset = Dataset.from_file(dataset) - loss_matrix = build_loss_matrix(dataset, 2024) + return loss - sim = Microsimulation(dataset=dataset) - weights = sim.calculate("household_weight", 2024).values - estimate_matrix, targets = loss_matrix - is_national = estimate_matrix.columns.str.startswith("nation/") - nation_normalisation_factor = is_national * (1 / is_national.sum()) - state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) - normalisation_factor = np.where( - is_national, nation_normalisation_factor, state_normalisation_factor - ) - weights @ estimate_matrix - - def get_loss_from_mask( - inclusion_mask, estimate_matrix, targets, normalisation_factor - ): - """ - Calculate the loss based on the inclusion mask and the estimate matrix. - """ - masked_weights = weights.copy() - original_weight_total = masked_weights.sum() - if (~inclusion_mask).sum() > 0: - masked_weights[~inclusion_mask] = 0 - masked_weight_total = masked_weights.sum() - masked_weights[inclusion_mask] *= ( - original_weight_total / masked_weight_total - ) - estimates = masked_weights @ estimate_matrix - rel_error = ((estimates - targets) + 1) / (targets + 1) - loss = ((rel_error * normalisation_factor) ** 2).mean() +def candidate_loss_contribution( + weights: np.ndarray, + estimate_matrix: np.ndarray, + targets: np.ndarray, + normalisation_factor: np.ndarray, + loss_rel_change_max: float, + count_iterations: int = 5, + view_fraction_per_iteration: float = 0.3, + fraction_remove_per_iteration: float = 0.1, +) -> np.ndarray: + """ + Minimization approach based on candidate loss contribution. 
+ + This function iteratively removes households that contribute least to the loss, + maintaining the calibration quality within the specified tolerance. - return loss + Parameters + ---------- + weights : (n,) household weights + estimate_matrix : (n, m) matrix mapping weights to estimates + targets : (m,) calibration targets + normalisation_factor : (m,) normalisation factors for different targets + loss_rel_change_max : maximum allowed relative change in loss + count_iterations : number of iterations to perform + view_fraction_per_iteration : fraction of households to evaluate each iteration + fraction_remove_per_iteration : fraction of households to remove each iteration - COUNT_ITERATIONS = 5 - VIEW_FRACTION_PER_ITERATION = 0.3 - FRACTION_REMOVE_PER_ITERATION = 0.1 + Returns + ------- + inclusion_mask : (n,) boolean mask of households to keep + """ from tqdm import tqdm full_mask = np.ones_like(weights, dtype=bool) - for i in range(COUNT_ITERATIONS): + + for i in range(count_iterations): inclusion_mask = full_mask.copy() baseline_loss = get_loss_from_mask( - inclusion_mask, estimate_matrix, targets, normalisation_factor + weights, + inclusion_mask, + estimate_matrix, + targets, + normalisation_factor, ) - household_loss_rel_changes = [] - indices = np.random.choice( - np.arange(len(weights)), - size=int(len(weights) * VIEW_FRACTION_PER_ITERATION), - replace=False, - ) - - # more efficient approach to compute losses for candidate households to be removed - # 1. sample only households that are currently *included* + # Sample only households that are currently included indices = np.random.choice( np.where(full_mask)[0], - size=int(full_mask.sum() * VIEW_FRACTION_PER_ITERATION), + size=int(full_mask.sum() * view_fraction_per_iteration), replace=False, ) - # 2. compute losses for the batch in one shot + + # Compute losses for the batch in one shot candidate_losses = losses_for_candidates( weights, indices, estimate_matrix, targets, normalisation_factor ) - # 3. convert to relative change vs. baseline + + # Convert to relative change vs. baseline household_loss_rel_changes = ( candidate_losses - baseline_loss ) / baseline_loss - inclusion_mask = full_mask.copy() - household_loss_rel_changes = np.array(household_loss_rel_changes) # Sort by the relative change in loss sorted_indices = np.argsort(household_loss_rel_changes) + # Remove the worst households - num_to_remove = int(len(weights) * FRACTION_REMOVE_PER_ITERATION) - worst_indices = sorted_indices[:num_to_remove] + num_to_remove = int(len(weights) * fraction_remove_per_iteration) + worst_indices = indices[sorted_indices[:num_to_remove]] inclusion_mask[worst_indices] = False + # Calculate the new loss new_loss = get_loss_from_mask( - inclusion_mask, estimate_matrix, targets, normalisation_factor + weights, + inclusion_mask, + estimate_matrix, + targets, + normalisation_factor, ) rel_change = (new_loss - baseline_loss) / baseline_loss + if rel_change > loss_rel_change_max: print( - f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}, which is too high ({rel_change:.2%}). Stopping." + f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}, " + f"which is too high ({rel_change:.2%}). Stopping." ) break + print( f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}" ) print( f"Removed {num_to_remove} households with worst relative loss changes." 
) + # Update the full mask full_mask &= inclusion_mask - household_ids = sim.calculate("household_id", 2024).values - remaining_households = household_ids[full_mask] + return full_mask + + +def random_sampling_minimization( + weights, + estimate_matrix, + targets, + normalisation_factor, + target_fractions=[0.1, 0.2, 0.3, 0.4, 0.5], +): + """A simple random sampling approach""" + n = len(weights) + + final_mask = None + lowest_loss = float("inf") + for fraction in target_fractions: + target_size = int(n * fraction) + # Random sampling with multiple attempts + best_mask = None + best_loss = float("inf") + + for _ in range(5): # Try 5 random samples + mask = np.zeros(n, dtype=bool) + mask[np.random.choice(n, target_size, replace=False)] = True + + loss = get_loss_from_mask( + weights, mask, estimate_matrix, targets, normalisation_factor + ) + + if loss < best_loss: + best_loss = loss + best_mask = mask + + if lowest_loss > best_loss: + lowest_loss = best_loss + final_mask = best_mask + + return final_mask + + +def minimise_dataset( + dataset, + output_path: str, + loss_rel_change_max: float, + minimization_function: Callable = candidate_loss_contribution, + **kwargs, +) -> None: + """ + Main function to minimize a dataset using a specified minimization approach. + + Parameters + ---------- + dataset : path to the dataset file or Dataset object + output_path : path where the minimized dataset will be saved + loss_rel_change_max : maximum allowed relative change in loss + minimization_function : function that implements the minimization logic + **kwargs : additional arguments to pass to the minimization function + """ + dataset = str(dataset) + create_calibration_log_file(dataset) + + dataset = Dataset.from_file(dataset) + loss_matrix = build_loss_matrix(dataset, 2024) - # At this point we have a mask of households to keep + sim = Microsimulation(dataset=dataset) - # I'm saving to a csv for ease of debugging, but we need to save to a .h5 file + weights = sim.calculate("household_weight", 2024).values + estimate_matrix, targets = loss_matrix + is_national = estimate_matrix.columns.str.startswith("nation/") + nation_normalisation_factor = is_national * (1 / is_national.sum()) + state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) + normalisation_factor = np.where( + is_national, nation_normalisation_factor, state_normalisation_factor + ) + + # Call the minimization function + inclusion_mask = minimization_function( + weights=weights, + estimate_matrix=estimate_matrix, + targets=targets, + normalisation_factor=normalisation_factor, + loss_rel_change_max=loss_rel_change_max, + **kwargs, + ) + + # Extract household IDs for remaining households + household_ids = sim.calculate("household_id", 2024).values + remaining_households = household_ids[inclusion_mask] + # Create a smaller dataset with only the remaining households df = sim.to_input_dataframe() smaller_df = df[df["household_id__2024"].isin(remaining_households)] @@ -208,27 +306,30 @@ def get_loss_from_mask( ) print(f"Weight relative change: {weight_rel_change:.2%}") + # Create new simulation with smaller dataset sim = Microsimulation(dataset=smaller_df) + # Rescale weights to maintain total sim.set_input( "household_weight", 2024, sim.calculate("household_weight", 2024).values / weight_rel_change, ) + # Prepare data for saving data = {} - for variable in sim.input_variables: data[variable] = {2024: sim.calculate(variable, 2024).values} if data[variable][2024].dtype == "object": data[variable][2024] = 
data[variable][2024].astype("S") + # Save to HDF5 file with h5py.File(output_path, "w") as f: for variable, values in data.items(): for year, value in values.items(): f.create_dataset(f"{variable}/{year}", data=value) - print(f"Saved minimised dataset to {output_path}") + print(f"Saved minimised dataset to {output_path}") create_calibration_log_file(output_path) @@ -240,4 +341,9 @@ def get_loss_from_mask( for file in files: output_path = file.with_name(file.stem + "_minimised.h5") - minimise_dataset(file, output_path, loss_rel_change_max=10) + minimise_dataset( + file, + output_path, + loss_rel_change_max=10, + minimization_function=candidate_loss_contribution, + ) From cd0776c0eb7d1745e987ace34ecc4b56306eee2b Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Fri, 11 Jul 2025 19:19:58 +0200 Subject: [PATCH 07/56] add notebook with testing functionality (havent tested locally) --- .../datasets/cps/enhanced_cps.py | 8 +- policyengine_us_data/utils/minimise.py | 2 +- test_minimization_approach.ipynb | 107 ++++++++++++++++++ 3 files changed, 115 insertions(+), 2 deletions(-) create mode 100644 test_minimization_approach.ipynb diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index bf303f7a..08798622 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -28,6 +28,7 @@ def reweight( targets_array, dropout_rate=0.05, log_path="calibration_log.csv", + penalty_approach="l0_sigmoid", ): target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") @@ -46,7 +47,7 @@ def reweight( ) # TO DO: replace this with a call to the python reweight.py package. - def loss(weights, penalty_approach="l0_sigmoid"): + def loss(weights, penalty_approach=penalty_approach): # Check for Nans in either the weights or the loss matrix if torch.isnan(weights).any(): raise ValueError("Weights contain NaNs") @@ -85,6 +86,11 @@ def loss(weights, penalty_approach="l0_sigmoid"): smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() # L1 penalty + l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs + + if penalty_approach == "l1": + l1 = torch.mean(weights) + return rel_error_normalized.mean() + l1_penalty_weight * l1 return rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 45212905..a9ba3959 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -330,7 +330,7 @@ def minimise_dataset( f.create_dataset(f"{variable}/{year}", data=value) print(f"Saved minimised dataset to {output_path}") - create_calibration_log_file(output_path) + create_calibration_log_file(output_path) if __name__ == "__main__": diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb new file mode 100644 index 00000000..519d2725 --- /dev/null +++ b/test_minimization_approach.ipynb @@ -0,0 +1,107 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 5, + "id": "d6dc9cca", + "metadata": {}, + "outputs": [], + "source": [ + "from policyengine_us_data.utils.minimise import minimise_dataset, random_sampling_minimization, candidate_loss_contribution\n", + "from policyengine_us_data.storage import STORAGE_FOLDER\n", + "from policyengine_us import Microsimulation\n", + "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", + "from policyengine_us_data.utils import 
build_loss_matrix\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db975ac1", + "metadata": {}, + "outputs": [], + "source": [ + "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", + "\n", + "files = [\n", + " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", + " ]\n", + "\n", + "approach = \"random_sampling_minimization\" # for which you can specify the fraction\n", + "minimization_function = random_sampling_minimization\n", + "# other minimization function approach is \"candidate_loss_contribution\"\n", + "\n", + "for file in files:\n", + " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + " minimise_dataset(\n", + " file,\n", + " output_path,\n", + " loss_rel_change_max=10,\n", + " minimization_function=minimization_function, \n", + " target_fractions=[0.5] # remove if switching approach\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35892c9d", + "metadata": {}, + "outputs": [], + "source": [ + "## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation\n", + "\n", + "input_dataset = ExtendedCPS_2024\n", + "\n", + "approach = \"l0_sigmoid\"\n", + "# other options are \"l0_log\", \"l0_exp\", \"l1\"\n", + "\n", + "sim = Microsimulation(dataset=input_dataset)\n", + "data = sim.dataset.load_dataset()\n", + "data[\"household_weight\"] = {}\n", + "original_weights = sim.calculate(\"household_weight\")\n", + "original_weights = original_weights.values + np.random.normal(\n", + " 1, 0.1, len(original_weights)\n", + ")\n", + "for year in range(2024, 2025):\n", + " loss_matrix, targets_array = build_loss_matrix(\n", + " input_dataset, year\n", + " )\n", + " optimised_weights = reweight(\n", + " original_weights,\n", + " loss_matrix,\n", + " targets_array,\n", + " log_path= STORAGE_FOLDER / approach / \"calibration_log.csv\",\n", + " penalty_approach=approach,\n", + " )\n", + " data[\"household_weight\"][year] = optimised_weights\n", + "\n", + "output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + "\n", + "data.save_dataset(output_path)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pe", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 2c050fc973ba312d070c27dcb7f1fb049e1e2af2 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Fri, 11 Jul 2025 19:20:55 +0200 Subject: [PATCH 08/56] lint --- policyengine_us_data/utils/minimise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index a9ba3959..45212905 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -330,7 +330,7 @@ def minimise_dataset( f.create_dataset(f"{variable}/{year}", data=value) print(f"Saved minimised dataset to {output_path}") - create_calibration_log_file(output_path) + create_calibration_log_file(output_path) if __name__ == "__main__": From ee98fc36ab920d571982862dc48d950b7a58ec3d Mon Sep 17 00:00:00 2001 From: eccuraa Date: Fri, 11 Jul 2025 20:06:32 -0400 Subject: [PATCH 09/56] debugged 2nd cell: created path & removed optional parameters. 
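This patch drops loss_rel_change_max from the required signature of minimise_dataset and forwards strategy-specific options through **kwargs, so each minimization function receives only the arguments it needs. A minimal usage sketch of the kwargs-based entry point (illustrative output paths that mirror the test notebook; not part of the diff below, and the directory layout is an assumption):

    from policyengine_us_data.storage import STORAGE_FOLDER
    from policyengine_us_data.utils.minimise import (
        minimise_dataset,
        candidate_loss_contribution,
        random_sampling_minimization,
    )

    ecps = STORAGE_FOLDER / "enhanced_cps_2024.h5"

    # Greedy candidate-removal strategy: loss_rel_change_max is forwarded
    # through **kwargs to candidate_loss_contribution.
    out = STORAGE_FOLDER / "candidate_loss_contribution" / "enhanced_cps_2024_minimised.h5"
    out.parent.mkdir(parents=True, exist_ok=True)
    minimise_dataset(
        ecps,
        out,
        minimization_function=candidate_loss_contribution,
        loss_rel_change_max=10,
    )

    # Random-sampling strategy: target_fractions is forwarded instead.
    out = STORAGE_FOLDER / "random_sampling_minimization" / "enhanced_cps_2024_minimised.h5"
    out.parent.mkdir(parents=True, exist_ok=True)
    minimise_dataset(
        ecps,
        out,
        minimization_function=random_sampling_minimization,
        target_fractions=[0.5],
    )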
--- policyengine_us_data/utils/minimise.py | 8 +- test_minimization_approach.ipynb | 219 +++++++++++++++++++++++-- 2 files changed, 210 insertions(+), 17 deletions(-) diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 45212905..e84e1bee 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -250,10 +250,11 @@ def random_sampling_minimization( def minimise_dataset( dataset, output_path: str, - loss_rel_change_max: float, minimization_function: Callable = candidate_loss_contribution, **kwargs, ) -> None: + #loss_rel_change_max = kwargs.pop('loss_rel_change_max', 10.0) + """ Main function to minimize a dataset using a specified minimization approach. @@ -288,8 +289,7 @@ def minimise_dataset( estimate_matrix=estimate_matrix, targets=targets, normalisation_factor=normalisation_factor, - loss_rel_change_max=loss_rel_change_max, - **kwargs, + **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. ) # Extract household IDs for remaining households @@ -344,6 +344,4 @@ def minimise_dataset( minimise_dataset( file, output_path, - loss_rel_change_max=10, - minimization_function=candidate_loss_contribution, ) diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 519d2725..8400d4fe 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "d6dc9cca", "metadata": {}, "outputs": [], @@ -12,15 +12,188 @@ "from policyengine_us import Microsimulation\n", "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", "from policyengine_us_data.utils import build_loss_matrix\n", - "import numpy as np" + "import numpy as np\n", + "import os\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "db975ac1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + "Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid enrollment for IN with target 1623361k\n", + "Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with target 1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + "Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid 
enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n", + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + "Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid enrollment for IN with target 1623361k\n", + "Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with target 1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + "Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid 
enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n", + "Iteration 1: Loss changed from 4.565926440883813e-08 to 5.336074793134824e-08\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 2: Loss changed from 5.336074793134824e-08 to 6.613975524012925e-08\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 3: Loss changed from 6.613975524012925e-08 to 8.772356346466888e-08\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 4: Loss changed from 8.772356346466888e-08 to 1.1406333044735764e-07\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 5: Loss changed from 1.1406333044735764e-07 to 1.3666972105275468e-07\n", + "Removed 4131 households with worst relative loss changes.\n", + "Weight relative change: 52.19%\n", + "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + "Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid 
enrollment for IN with target 1623361k\n", + "Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with target 1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + "Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n" + ] + } + ], "source": [ "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", "\n", @@ -28,27 +201,49 @@ " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", " ]\n", "\n", - "approach = \"random_sampling_minimization\" # for which you can specify the fraction\n", - "minimization_function = random_sampling_minimization\n", - "# other minimization function approach is \"candidate_loss_contribution\"\n", + "approach = \"candidate_loss_contribution\" # for which you can specify the fraction\n", + "minimization_function = candidate_loss_contribution\n", + "# other minimization function approach is \"candidate_loss_contribution\", for which you can specify the tolerance for loss relative change.\n", "\n", "for file in files:\n", " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + "\n", " minimise_dataset(\n", " file,\n", " output_path,\n", " loss_rel_change_max=10,\n", " minimization_function=minimization_function, \n", - " target_fractions=[0.5] # remove if switching approach\n", + " #target_fractions=[0.5] # remove if switching approach\n", " )" ] }, { "cell_type": "code", - "execution_count": null, 
+ "execution_count": 5, "id": "35892c9d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "ValueError", + "evalue": "Invalid response code 404 for url https://api.github.com/repos/policyengine/irs-soi-puf/releases/tags/1.8.0.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[5], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m approach \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124ml0_sigmoid\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# other options are \"l0_log\", \"l0_exp\", \"l1\"\u001b[39;00m\n\u001b[0;32m----> 8\u001b[0m sim \u001b[38;5;241m=\u001b[39m \u001b[43mMicrosimulation\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minput_dataset\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 9\u001b[0m data \u001b[38;5;241m=\u001b[39m sim\u001b[38;5;241m.\u001b[39mdataset\u001b[38;5;241m.\u001b[39mload_dataset()\n\u001b[1;32m 10\u001b[0m data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhousehold_weight\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m {}\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/system.py:221\u001b[0m, in \u001b[0;36mMicrosimulation.__init__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 214\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 215\u001b[0m dataset \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, \u001b[38;5;28mstr\u001b[39m)\n\u001b[1;32m 217\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcps_2023\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m dataset\n\u001b[1;32m 218\u001b[0m ):\n\u001b[1;32m 219\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m2023\u001b[39m\n\u001b[0;32m--> 221\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 223\u001b[0m reform \u001b[38;5;241m=\u001b[39m create_structural_reforms_from_parameters(\n\u001b[1;32m 224\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtax_benefit_system\u001b[38;5;241m.\u001b[39mparameters, start_instant\n\u001b[1;32m 225\u001b[0m )\n\u001b[1;32m 226\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m reform \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:184\u001b[0m, in \u001b[0;36mSimulation.__init__\u001b[0;34m(self, tax_benefit_system, populations, situation, dataset, reform, trace, default_input_period, default_calculation_period)\u001b[0m\n\u001b[1;32m 180\u001b[0m dataset \u001b[38;5;241m=\u001b[39m 
Dataset\u001b[38;5;241m.\u001b[39mfrom_file(\n\u001b[1;32m 181\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 182\u001b[0m )\n\u001b[1;32m 183\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, \u001b[38;5;28mtype\u001b[39m):\n\u001b[0;32m--> 184\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset: Dataset \u001b[38;5;241m=\u001b[39m \u001b[43mdataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequire\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 185\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, pd\u001b[38;5;241m.\u001b[39mDataFrame):\n\u001b[1;32m 186\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_dataframe(\n\u001b[1;32m 187\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 188\u001b[0m )\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/data/dataset.py:103\u001b[0m, in \u001b[0;36mDataset.__init__\u001b[0;34m(self, require)\u001b[0m\n\u001b[1;32m 101\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdownload()\n\u001b[1;32m 102\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 103\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/datasets/cps/extended_cps.py:147\u001b[0m, in \u001b[0;36mExtendedCPS.generate\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 144\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpolicyengine_us\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Microsimulation\n\u001b[1;32m 146\u001b[0m cps_sim \u001b[38;5;241m=\u001b[39m Microsimulation(dataset\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcps)\n\u001b[0;32m--> 147\u001b[0m puf_sim \u001b[38;5;241m=\u001b[39m \u001b[43mMicrosimulation\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpuf\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 149\u001b[0m puf_sim\u001b[38;5;241m.\u001b[39msubsample(\u001b[38;5;241m10_000\u001b[39m)\n\u001b[1;32m 151\u001b[0m INPUTS \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 152\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mage\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 153\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mis_male\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mis_tax_unit_dependent\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 159\u001b[0m ]\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/system.py:221\u001b[0m, in \u001b[0;36mMicrosimulation.__init__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 214\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 215\u001b[0m dataset \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 
216\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, \u001b[38;5;28mstr\u001b[39m)\n\u001b[1;32m 217\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcps_2023\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m dataset\n\u001b[1;32m 218\u001b[0m ):\n\u001b[1;32m 219\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m2023\u001b[39m\n\u001b[0;32m--> 221\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 223\u001b[0m reform \u001b[38;5;241m=\u001b[39m create_structural_reforms_from_parameters(\n\u001b[1;32m 224\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtax_benefit_system\u001b[38;5;241m.\u001b[39mparameters, start_instant\n\u001b[1;32m 225\u001b[0m )\n\u001b[1;32m 226\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m reform \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:184\u001b[0m, in \u001b[0;36mSimulation.__init__\u001b[0;34m(self, tax_benefit_system, populations, situation, dataset, reform, trace, default_input_period, default_calculation_period)\u001b[0m\n\u001b[1;32m 180\u001b[0m dataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_file(\n\u001b[1;32m 181\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 182\u001b[0m )\n\u001b[1;32m 183\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, \u001b[38;5;28mtype\u001b[39m):\n\u001b[0;32m--> 184\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset: Dataset \u001b[38;5;241m=\u001b[39m \u001b[43mdataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequire\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 185\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, pd\u001b[38;5;241m.\u001b[39mDataFrame):\n\u001b[1;32m 186\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_dataframe(\n\u001b[1;32m 187\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 188\u001b[0m )\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/data/dataset.py:101\u001b[0m, in \u001b[0;36mDataset.__init__\u001b[0;34m(self, require)\u001b[0m\n\u001b[1;32m 99\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexists \u001b[38;5;129;01mand\u001b[39;00m require:\n\u001b[1;32m 100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39murl \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 
101\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdownload\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 102\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgenerate()\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/data/dataset.py:343\u001b[0m, in \u001b[0;36mDataset.download\u001b[0;34m(self, url, version)\u001b[0m\n\u001b[1;32m 341\u001b[0m response \u001b[38;5;241m=\u001b[39m requests\u001b[38;5;241m.\u001b[39mget(url, headers\u001b[38;5;241m=\u001b[39mauth_headers)\n\u001b[1;32m 342\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m response\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m200\u001b[39m:\n\u001b[0;32m--> 343\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 344\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid response code \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresponse\u001b[38;5;241m.\u001b[39mstatus_code\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m for url \u001b[39m\u001b[38;5;132;01m{\u001b[39;00murl\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 345\u001b[0m )\n\u001b[1;32m 346\u001b[0m assets \u001b[38;5;241m=\u001b[39m response\u001b[38;5;241m.\u001b[39mjson()[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124massets\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 347\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m asset \u001b[38;5;129;01min\u001b[39;00m assets:\n", + "\u001b[0;31mValueError\u001b[0m: Invalid response code 404 for url https://api.github.com/repos/policyengine/irs-soi-puf/releases/tags/1.8.0." + ] + } + ], "source": [ "## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation\n", "\n", @@ -85,7 +280,7 @@ ], "metadata": { "kernelspec": { - "display_name": "pe", + "display_name": "policyengine-us-data", "language": "python", "name": "python3" }, @@ -99,7 +294,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.11" + "version": "3.11.13" } }, "nbformat": 4, From f6d7f0fa00f158f099c2dc15116fac4987d33085 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Mon, 14 Jul 2025 15:22:58 +0200 Subject: [PATCH 10/56] few updates to the testing framework --- changelog_entry.yaml | 2 +- .../datasets/cps/enhanced_cps.py | 78 +++++++++++++------ policyengine_us_data/utils/minimise.py | 75 +++++++++++++----- pyproject.toml | 4 +- test_minimization_approach.ipynb | 75 +++++++++--------- 5 files changed, 149 insertions(+), 85 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 84eeb584..ac664753 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,4 +1,4 @@ - bump: minor changes: added: - - Enhanced CPS minimizing tests. \ No newline at end of file + - Enhanced CPS minimizing tests. 
\ No newline at end of file diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 08798622..6ad510f3 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -22,13 +22,25 @@ torch = None +bad_targets = [ + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", +] + + def reweight( original_weights, loss_matrix, targets_array, dropout_rate=0.05, log_path="calibration_log.csv", - penalty_approach="l0_sigmoid", + penalty_approach=None, ): target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") @@ -64,35 +76,43 @@ def loss(weights, penalty_approach=penalty_approach): if torch.isnan(rel_error_normalized).any(): raise ValueError("Relative error contains NaNs") - # L0 penalty (approximated with smooth function) - # Since L0 is non-differentiable, we use a smooth approximation - # Common approaches: + if penalty_approach is not None: + # L0 penalty (approximated with smooth function) + # Since L0 is non-differentiable, we use a smooth approximation + # Common approaches: + + epsilon = 1e-3 # Threshold for "near zero" + l0_penalty_weight = 1e-1 # Adjust this hyperparameter - epsilon = 1e-3 # Threshold for "near zero" - l0_penalty_weight = 1e-1 # Adjust this hyperparameter + # Option 1: Sigmoid approximation + if penalty_approach == "l0_sigmoid": + smoothed_l0 = torch.sigmoid( + (weights - epsilon) / (epsilon * 0.1) + ).mean() - # Option 1: Sigmoid approximation - if penalty_approach == "l0_sigmoid": - smoothed_l0 = torch.sigmoid( - (weights - epsilon) / (epsilon * 0.1) - ).mean() + # Option 2: Log-sum penalty (smoother) + if penalty_approach == "l0_log": + smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len( + weights + ) - # Option 2: Log-sum penalty (smoother) - if penalty_approach == "l0_log": - smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len(weights) + # Option 3: Exponential penalty + if penalty_approach == "l0_exp": + smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() - # Option 3: Exponential penalty - if penalty_approach == "l0_exp": - smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() + # L1 penalty + l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs - # L1 penalty - l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs + if penalty_approach == "l1": + l1 = torch.mean(weights) + return rel_error_normalized.mean() + l1_penalty_weight * l1 - if penalty_approach == "l1": - l1 = torch.mean(weights) - return rel_error_normalized.mean() + l1_penalty_weight * l1 + return ( + rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 + ) - return rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 + else: + return rel_error_normalized.mean() def dropout_weights(weights, p): if p == 0: @@ -213,10 +233,18 @@ def generate(self): 
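The comments in the hunk above describe three smooth surrogates for the non-differentiable L0 penalty, plus an L1 option. As a rough, self-contained sketch of how those terms behave on a toy weight vector (illustrative values only; nothing is assumed beyond the torch import the module already uses):

    import torch

    weights = torch.tensor([0.0, 0.0005, 0.01, 2.0, 150.0])  # toy household weights
    epsilon = 1e-3  # threshold below which a weight counts as "near zero"

    # Sigmoid surrogate: ~0 for weights well below epsilon, ~1 well above it
    l0_sigmoid = torch.sigmoid((weights - epsilon) / (epsilon * 0.1)).mean()

    # Log-sum surrogate: grows slowly with weight magnitude
    l0_log = torch.log(1 + weights / epsilon).sum() / len(weights)

    # Exponential surrogate: saturates at 1 for weights much larger than epsilon
    l0_exp = (1 - torch.exp(-weights / epsilon)).mean()

    # L1 term: mean weight
    l1 = torch.mean(weights)

    print(l0_sigmoid.item(), l0_log.item(), l0_exp.item(), l1.item())

Each surrogate shrinks as more weights sit at or near zero, which is what lets the optimiser trade a small amount of target fit for a sparser, smaller dataset without a hard, non-differentiable count.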
loss_matrix, targets_array = build_loss_matrix( self.input_dataset, year ) + + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_array_clean = targets_array[keep_idx] + assert loss_matrix_clean.shape[1] == targets_array_clean.size + optimised_weights = reweight( original_weights, - loss_matrix, - targets_array, + loss_matrix_clean, + targets_array_clean, log_path="calibration_log.csv", ) data["household_weight"][year] = optimised_weights diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index e84e1bee..df193c6e 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -7,30 +7,53 @@ from policyengine_us_data.storage import STORAGE_FOLDER from typing import Optional, Callable - -def create_calibration_log_file(file_path): +bad_targets = [ + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", +] + + +def create_calibration_log_file(file_path, epoch=0): dataset = Dataset.from_file(file_path) - loss_matrix = build_loss_matrix(dataset, 2024) + loss_matrix, targets = build_loss_matrix(dataset, 2024) + + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_clean = targets[keep_idx] + assert loss_matrix_clean.shape[1] == targets_clean.size sim = Microsimulation(dataset=dataset) - estimates = sim.calculate("household_weight", 2024).values @ loss_matrix[0] - target_names = loss_matrix[0].columns - target_values = loss_matrix[1] + estimates = ( + sim.calculate("household_weight", 2024).values @ loss_matrix_clean + ) + target_names = loss_matrix_clean.columns df = pd.DataFrame( { "target_name": target_names, "estimate": estimates, - "target": target_values, + "target": targets_clean, } ) - df["epoch"] = 0 + df["epoch"] = epoch df["error"] = df["estimate"] - df["target"] df["rel_error"] = df["error"] / df["target"] df["abs_error"] = df["error"].abs() - df["rel_abs_error"] = df["abs_error"] / df["target"].abs() + df["rel_abs_error"] = ( + df["abs_error"] / df["target"].abs() + if df["target"].abs().sum() > 0 + else np.nan + ) df["loss"] = (df["rel_error"] ** 2).mean() df.to_csv( @@ -215,11 +238,14 @@ def random_sampling_minimization( estimate_matrix, targets, normalisation_factor, + random=True, target_fractions=[0.1, 0.2, 0.3, 0.4, 0.5], ): """A simple random sampling approach""" n = len(weights) + household_weights_normalized = weights / weights.sum() + final_mask = None lowest_loss = float("inf") for fraction in target_fractions: @@ -230,7 +256,14 @@ def random_sampling_minimization( for _ in range(5): # Try 5 random samples mask = np.zeros(n, dtype=bool) - mask[np.random.choice(n, target_size, replace=False)] = True + mask[ 
+ np.random.choice( + n, + target_size, + p=household_weights_normalized if random else None, + replace=False, + ) + ] = True loss = get_loss_from_mask( weights, mask, estimate_matrix, targets, normalisation_factor @@ -253,8 +286,6 @@ def minimise_dataset( minimization_function: Callable = candidate_loss_contribution, **kwargs, ) -> None: - #loss_rel_change_max = kwargs.pop('loss_rel_change_max', 10.0) - """ Main function to minimize a dataset using a specified minimization approach. @@ -270,13 +301,19 @@ def minimise_dataset( create_calibration_log_file(dataset) dataset = Dataset.from_file(dataset) - loss_matrix = build_loss_matrix(dataset, 2024) + loss_matrix, targets = build_loss_matrix(dataset, 2024) + + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_clean = targets[keep_idx] + assert loss_matrix_clean.shape[1] == targets_clean.size sim = Microsimulation(dataset=dataset) weights = sim.calculate("household_weight", 2024).values - estimate_matrix, targets = loss_matrix - is_national = estimate_matrix.columns.str.startswith("nation/") + is_national = loss_matrix_clean.columns.str.startswith("nation/") nation_normalisation_factor = is_national * (1 / is_national.sum()) state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) normalisation_factor = np.where( @@ -286,10 +323,10 @@ def minimise_dataset( # Call the minimization function inclusion_mask = minimization_function( weights=weights, - estimate_matrix=estimate_matrix, - targets=targets, + estimate_matrix=loss_matrix_clean, + targets=targets_clean, normalisation_factor=normalisation_factor, - **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. + **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. 
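The random_sampling_minimization hunk above draws households with probability proportional to their calibration weight when random=True, then scores each candidate mask against the targets and keeps the best one. A self-contained sketch of just the weighted selection step, with made-up sizes and weights (only numpy assumed):

    import numpy as np

    rng = np.random.default_rng(0)
    weights = rng.uniform(0.5, 3.0, size=1_000)  # stand-in household weights
    p = weights / weights.sum()                  # selection probabilities
    target_size = int(0.3 * len(weights))        # e.g. keep 30% of households

    mask = np.zeros(len(weights), dtype=bool)
    mask[rng.choice(len(weights), target_size, replace=False, p=p)] = True
    print(f"{mask.sum()} of {len(weights)} households retained")

The idea behind sampling proportionally to weight is that households carrying more of the weighted totals are more likely to be kept, so a random subset should disturb the calibration loss less than a uniform draw would.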
) # Extract household IDs for remaining households @@ -330,7 +367,7 @@ def minimise_dataset( f.create_dataset(f"{variable}/{year}", data=value) print(f"Saved minimised dataset to {output_path}") - create_calibration_log_file(output_path) + create_calibration_log_file(output_path, epoch=500) if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml index 0352db69..65d1ca8e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,8 +17,8 @@ authors = [ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ - "policyengine-us>=1.197.0", - "policyengine-core>=3.14.1", + "policyengine-us>=1.340.0", + "policyengine-core>=3.17.1", "requests", "tqdm", "microdf_python>=0.4.3", diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 8400d4fe..54f3c6fa 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 6, + "execution_count": 1, "id": "d6dc9cca", "metadata": {}, "outputs": [], @@ -13,12 +13,24 @@ "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", "from policyengine_us_data.utils import build_loss_matrix\n", "import numpy as np\n", - "import os\n" + "import os" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, + "id": "6daabe7c", + "metadata": {}, + "outputs": [], + "source": [ + "# Original ECPS 2024 dataset size (for household entity): 41310\n", + "# After minimization through \"candidate_loss_contribution\" and a 1.0 max error change (for household entity): 20655 \n", + "# After minimization through \"candidate_loss_contribution\" and a 0.001 max error change (for household entity): 24786\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "id": "db975ac1", "metadata": {}, "outputs": [ @@ -128,18 +140,17 @@ "Targeting Medicaid enrollment for WI with target 1108320k\n", "Targeting Medicaid enrollment for WV with target 467632k\n", "Targeting Medicaid enrollment for WY with target 57320k\n", - "Iteration 1: Loss changed from 4.565926440883813e-08 to 5.336074793134824e-08\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 2: Loss changed from 5.336074793134824e-08 to 6.613975524012925e-08\n", + "Iteration 1: Loss changed from 3.2762747622384236e-07 to 3.269240481699973e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Iteration 3: Loss changed from 6.613975524012925e-08 to 8.772356346466888e-08\n", + "Iteration 2: Loss changed from 3.269240481699973e-07 to 3.260236288186747e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Iteration 4: Loss changed from 8.772356346466888e-08 to 1.1406333044735764e-07\n", + "Iteration 3: Loss changed from 3.260236288186747e-07 to 3.258031076364204e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Iteration 5: Loss changed from 1.1406333044735764e-07 to 1.3666972105275468e-07\n", + "Iteration 4: Loss changed from 3.258031076364204e-07 to 3.2572688458943833e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Weight relative change: 52.19%\n", - "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", + "Iteration 5: Loss changed from 3.2572688458943833e-07 to 3.2598496400284314e-07, which is too high (0.08%). 
Stopping.\n", + "Weight relative change: 99.10%\n", + "Saved minimised dataset to /Users/movil1/Desktop/PYTHONJOBS/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", "Targeting Medicaid enrollment for AK with target 231577k\n", "Targeting Medicaid enrollment for AL with target 766009k\n", "Targeting Medicaid enrollment for AR with target 733561k\n", @@ -203,7 +214,7 @@ "\n", "approach = \"candidate_loss_contribution\" # for which you can specify the fraction\n", "minimization_function = candidate_loss_contribution\n", - "# other minimization function approach is \"candidate_loss_contribution\", for which you can specify the tolerance for loss relative change.\n", + "# other minimization function approach is \"random_sampling_minimization\", for which you can specify the tolerance for loss relative change.\n", "\n", "for file in files:\n", " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", @@ -212,38 +223,18 @@ " minimise_dataset(\n", " file,\n", " output_path,\n", - " loss_rel_change_max=10,\n", " minimization_function=minimization_function, \n", - " #target_fractions=[0.5] # remove if switching approach\n", + " # target_fractions=[0.5] # remove if switching approach\n", + " loss_rel_change_max=0.0001, # remove if switching approach\n", " )" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "35892c9d", "metadata": {}, - "outputs": [ - { - "ename": "ValueError", - "evalue": "Invalid response code 404 for url https://api.github.com/repos/policyengine/irs-soi-puf/releases/tags/1.8.0.", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[5], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m approach \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124ml0_sigmoid\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# other options are \"l0_log\", \"l0_exp\", \"l1\"\u001b[39;00m\n\u001b[0;32m----> 8\u001b[0m sim \u001b[38;5;241m=\u001b[39m \u001b[43mMicrosimulation\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minput_dataset\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 9\u001b[0m data \u001b[38;5;241m=\u001b[39m sim\u001b[38;5;241m.\u001b[39mdataset\u001b[38;5;241m.\u001b[39mload_dataset()\n\u001b[1;32m 10\u001b[0m data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhousehold_weight\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m {}\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/system.py:221\u001b[0m, in \u001b[0;36mMicrosimulation.__init__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 214\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 215\u001b[0m dataset \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, \u001b[38;5;28mstr\u001b[39m)\n\u001b[1;32m 217\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcps_2023\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m dataset\n\u001b[1;32m 218\u001b[0m ):\n\u001b[1;32m 219\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m2023\u001b[39m\n\u001b[0;32m--> 221\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 223\u001b[0m reform \u001b[38;5;241m=\u001b[39m create_structural_reforms_from_parameters(\n\u001b[1;32m 224\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtax_benefit_system\u001b[38;5;241m.\u001b[39mparameters, start_instant\n\u001b[1;32m 225\u001b[0m )\n\u001b[1;32m 226\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m reform \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:184\u001b[0m, in \u001b[0;36mSimulation.__init__\u001b[0;34m(self, tax_benefit_system, populations, situation, dataset, reform, trace, default_input_period, default_calculation_period)\u001b[0m\n\u001b[1;32m 180\u001b[0m dataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_file(\n\u001b[1;32m 181\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 182\u001b[0m )\n\u001b[1;32m 183\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, \u001b[38;5;28mtype\u001b[39m):\n\u001b[0;32m--> 184\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset: Dataset \u001b[38;5;241m=\u001b[39m \u001b[43mdataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequire\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 185\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, pd\u001b[38;5;241m.\u001b[39mDataFrame):\n\u001b[1;32m 186\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_dataframe(\n\u001b[1;32m 187\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 188\u001b[0m )\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/data/dataset.py:103\u001b[0m, in \u001b[0;36mDataset.__init__\u001b[0;34m(self, require)\u001b[0m\n\u001b[1;32m 101\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdownload()\n\u001b[1;32m 102\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 103\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/datasets/cps/extended_cps.py:147\u001b[0m, in \u001b[0;36mExtendedCPS.generate\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 144\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpolicyengine_us\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Microsimulation\n\u001b[1;32m 146\u001b[0m cps_sim \u001b[38;5;241m=\u001b[39m 
Microsimulation(dataset\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcps)\n\u001b[0;32m--> 147\u001b[0m puf_sim \u001b[38;5;241m=\u001b[39m \u001b[43mMicrosimulation\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpuf\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 149\u001b[0m puf_sim\u001b[38;5;241m.\u001b[39msubsample(\u001b[38;5;241m10_000\u001b[39m)\n\u001b[1;32m 151\u001b[0m INPUTS \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 152\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mage\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 153\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mis_male\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mis_tax_unit_dependent\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 159\u001b[0m ]\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/system.py:221\u001b[0m, in \u001b[0;36mMicrosimulation.__init__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 214\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 215\u001b[0m dataset \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, \u001b[38;5;28mstr\u001b[39m)\n\u001b[1;32m 217\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcps_2023\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m dataset\n\u001b[1;32m 218\u001b[0m ):\n\u001b[1;32m 219\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m2023\u001b[39m\n\u001b[0;32m--> 221\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 223\u001b[0m reform \u001b[38;5;241m=\u001b[39m create_structural_reforms_from_parameters(\n\u001b[1;32m 224\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtax_benefit_system\u001b[38;5;241m.\u001b[39mparameters, start_instant\n\u001b[1;32m 225\u001b[0m )\n\u001b[1;32m 226\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m reform \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:184\u001b[0m, in \u001b[0;36mSimulation.__init__\u001b[0;34m(self, tax_benefit_system, populations, situation, dataset, reform, trace, default_input_period, default_calculation_period)\u001b[0m\n\u001b[1;32m 180\u001b[0m dataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_file(\n\u001b[1;32m 181\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 182\u001b[0m )\n\u001b[1;32m 183\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, 
\u001b[38;5;28mtype\u001b[39m):\n\u001b[0;32m--> 184\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset: Dataset \u001b[38;5;241m=\u001b[39m \u001b[43mdataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequire\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 185\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dataset, pd\u001b[38;5;241m.\u001b[39mDataFrame):\n\u001b[1;32m 186\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_dataframe(\n\u001b[1;32m 187\u001b[0m dataset, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_input_period\n\u001b[1;32m 188\u001b[0m )\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/data/dataset.py:101\u001b[0m, in \u001b[0;36mDataset.__init__\u001b[0;34m(self, require)\u001b[0m\n\u001b[1;32m 99\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexists \u001b[38;5;129;01mand\u001b[39;00m require:\n\u001b[1;32m 100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39murl \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 101\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdownload\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 102\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgenerate()\n", - "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/data/dataset.py:343\u001b[0m, in \u001b[0;36mDataset.download\u001b[0;34m(self, url, version)\u001b[0m\n\u001b[1;32m 341\u001b[0m response \u001b[38;5;241m=\u001b[39m requests\u001b[38;5;241m.\u001b[39mget(url, headers\u001b[38;5;241m=\u001b[39mauth_headers)\n\u001b[1;32m 342\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m response\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m200\u001b[39m:\n\u001b[0;32m--> 343\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 344\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid response code \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresponse\u001b[38;5;241m.\u001b[39mstatus_code\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m for url \u001b[39m\u001b[38;5;132;01m{\u001b[39;00murl\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 345\u001b[0m )\n\u001b[1;32m 346\u001b[0m assets \u001b[38;5;241m=\u001b[39m response\u001b[38;5;241m.\u001b[39mjson()[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124massets\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 347\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m asset \u001b[38;5;129;01min\u001b[39;00m assets:\n", - "\u001b[0;31mValueError\u001b[0m: Invalid response code 404 for url https://api.github.com/repos/policyengine/irs-soi-puf/releases/tags/1.8.0." 
- ] - } - ], + "outputs": [], "source": [ "## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation\n", "\n", @@ -276,11 +267,19 @@ "\n", "data.save_dataset(output_path)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4cf8e89", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "policyengine-us-data", + "display_name": "pe", "language": "python", "name": "python3" }, @@ -294,7 +293,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.13" + "version": "3.11.11" } }, "nbformat": 4, From a042a01f7826997d0ac99b330183b80cfee167df Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 13:44:20 -0400 Subject: [PATCH 11/56] added CPS_2023 to lite mode generation --- changelog_entry.yaml | 6 +++--- policyengine_us_data/datasets/cps/cps.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index ac664753..dcce3f1a 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,4 +1,4 @@ -- bump: minor +- bump: patch changes: - added: - - Enhanced CPS minimizing tests. \ No newline at end of file + changed: + - lite mode now builds CPS_2023 in addition to CPS_2024 diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 3b976a31..fde981ba 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2006,6 +2006,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if __name__ == "__main__": if test_lite: + CPS_2023().generate() CPS_2024().generate() else: CPS_2021().generate() From cabeb56c7a1fe926eaf4c5aa5ecd26f45df3043f Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 14:54:23 -0400 Subject: [PATCH 12/56] Fixed manual test --- .github/workflows/code_changes.yaml | 1 + .github/workflows/manual_tests.yaml | 2 +- policyengine_us_data/datasets/cps/cps.py | 1 + pyproject.toml | 4 ++-- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/code_changes.yaml b/.github/workflows/code_changes.yaml index 6b474227..edd804db 100644 --- a/.github/workflows/code_changes.yaml +++ b/.github/workflows/code_changes.yaml @@ -2,6 +2,7 @@ name: Code changes on: + workflow_call: push: branches: - main diff --git a/.github/workflows/manual_tests.yaml b/.github/workflows/manual_tests.yaml index a2daca18..fb13ba89 100644 --- a/.github/workflows/manual_tests.yaml +++ b/.github/workflows/manual_tests.yaml @@ -11,7 +11,7 @@ on: jobs: test: - uses: ./.github/workflows/pr_changelog.yaml + uses: ./.github/workflows/code_changes.yaml with: TEST_LITE: ${{ github.event.inputs.test_lite }} secrets: inherit diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index fde981ba..177f4707 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2008,6 +2008,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() + print(2 + 2) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index 65d1ca8e..3490ff1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,8 +17,8 @@ authors = [ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ - "policyengine-us>=1.340.0", - "policyengine-core>=3.17.1", + "policyengine-us>=1.333.0", + "policyengine-core>=3.14.1", "requests", "tqdm", 
"microdf_python>=0.4.3", From 7b76afba9eb55c3d2588c1ba5c6683a48e3709f7 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 15:02:22 -0400 Subject: [PATCH 13/56] try again with locked version --- policyengine_us_data/datasets/cps/cps.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 177f4707..09a594c3 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2008,7 +2008,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 2) + print(2 + 3) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index 3490ff1b..74af05bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ authors = [ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ - "policyengine-us>=1.333.0", + "policyengine-us==1.333.0", "policyengine-core>=3.14.1", "requests", "tqdm", From 4056df4762b5d5e98ff6da815eae8de1484a4c25 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 15:44:32 -0400 Subject: [PATCH 14/56] trying things --- policyengine_us_data/datasets/cps/cps.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 09a594c3..1edce6e9 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -100,9 +100,14 @@ def downsample(self, frac: float): original_dtypes = { key: original_data[key].dtype for key in original_data } - + print("\n\nHERE IS THE PROBLEM-----") + print(f"frac is {frac}") + print(self) + print(Microsimulation) sim = Microsimulation(dataset=self) - sim.subsample(frac=frac) + print(sim) + print(sim.subsample) + #sim.subsample(frac=frac) for key in original_data: if key not in sim.tax_benefit_system.variables: From 96c4c25b71b5e148059be66a28805ad41c8cc28b Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 15:45:47 -0400 Subject: [PATCH 15/56] lint --- policyengine_us_data/datasets/cps/cps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 1edce6e9..30688719 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -107,7 +107,7 @@ def downsample(self, frac: float): sim = Microsimulation(dataset=self) print(sim) print(sim.subsample) - #sim.subsample(frac=frac) + # sim.subsample(frac=frac) for key in original_data: if key not in sim.tax_benefit_system.variables: From e20c75c202531e72fd118107c40fa10a0cda6e79 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 16:05:26 -0400 Subject: [PATCH 16/56] trying 3.11.12 --- policyengine_us_data/datasets/cps/cps.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 30688719..8219e915 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -107,7 +107,7 @@ def downsample(self, frac: float): sim = Microsimulation(dataset=self) print(sim) print(sim.subsample) - # sim.subsample(frac=frac) + sim.subsample(frac=frac) for key in original_data: if key not in sim.tax_benefit_system.variables: @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if 
test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 3) + print(2 + 5) else: CPS_2021().generate() CPS_2022().generate() From 776eda8ce513f7e1b845cb8212abd17301e46c73 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 16:10:26 -0400 Subject: [PATCH 17/56] now actually specifying py version --- policyengine_us_data/datasets/cps/cps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 8219e915..a25aba26 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 5) + print(2 + 7) else: CPS_2021().generate() CPS_2022().generate() From cd771794473e0bb1f5005e7d6c598d8c1bc2a112 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 16:33:21 -0400 Subject: [PATCH 18/56] pandas v --- policyengine_us_data/datasets/cps/cps.py | 2 +- pyproject.toml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index a25aba26..b3554604 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 7) + print(2 + 8) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index 74af05bf..6c767ede 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ requires-python = ">=3.11, <3.13.0" dependencies = [ "policyengine-us==1.333.0", "policyengine-core>=3.14.1", + "pandas==2.3.1", "requests", "tqdm", "microdf_python>=0.4.3", From d0ce44db56b066e4d370bc434fba08435f65e01f Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 16:47:12 -0400 Subject: [PATCH 19/56] small runner --- .github/workflows/pr_code_changes.yaml | 2 +- policyengine_us_data/datasets/cps/cps.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 213d192f..385e5a4c 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -49,7 +49,7 @@ jobs: run: python -c "from policyengine_core.data import Dataset; print('Core import OK')" Test: - runs-on: larger-runner + runs-on: ubuntu-latest needs: Lint env: HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index b3554604..027c2ef5 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 8) + print(2 + 0) else: CPS_2021().generate() CPS_2022().generate() From eb96cd5f706b0b718c39e36fa4fd1854bb3e3b0d Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 16:53:57 -0400 Subject: [PATCH 20/56] trying everything --- .github/workflows/pr_code_changes.yaml | 2 +- policyengine_us_data/datasets/cps/cps.py | 2 +- pyproject.toml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 385e5a4c..02209591 100644 --- a/.github/workflows/pr_code_changes.yaml +++ 
b/.github/workflows/pr_code_changes.yaml @@ -63,7 +63,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.11' + python-version: '3.11.12' - name: Install package run: uv pip install -e .[dev] --system diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 027c2ef5..afbf223f 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 0) + print(2 + 9) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index 6c767ede..d87290a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,11 +15,11 @@ authors = [ {name = "PolicyEngine", email = "hello@policyengine.org"}, ] license = {file = "LICENSE"} -requires-python = ">=3.11, <3.13.0" +requires-python = ">=3.11, <3.11.13" dependencies = [ "policyengine-us==1.333.0", "policyengine-core>=3.14.1", - "pandas==2.3.1", + "pandas==2.3.0", "requests", "tqdm", "microdf_python>=0.4.3", From 59ff94e82cd4dbd0aba16b488fd0b8ec16ca5531 Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 17:02:45 -0400 Subject: [PATCH 21/56] relaxing python version in pyproject.toml --- policyengine_us_data/datasets/cps/cps.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index afbf223f..3173d4d6 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2013,7 +2013,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(2 + 9) + print(3 + 9) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index d87290a2..fe5fda52 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ authors = [ {name = "PolicyEngine", email = "hello@policyengine.org"}, ] license = {file = "LICENSE"} -requires-python = ">=3.11, <3.11.13" +requires-python = ">=3.11, <3.13.0" dependencies = [ "policyengine-us==1.333.0", "policyengine-core>=3.14.1", From d3fa67bf98762b48c6fe2397275c1d0aac2ff77b Mon Sep 17 00:00:00 2001 From: baogorek Date: Fri, 11 Jul 2025 17:29:12 -0400 Subject: [PATCH 22/56] putting things back in order. 
--- policyengine_us_data/datasets/cps/cps.py | 7 ------- pyproject.toml | 2 +- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 3173d4d6..d9957cbb 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -100,13 +100,7 @@ def downsample(self, frac: float): original_dtypes = { key: original_data[key].dtype for key in original_data } - print("\n\nHERE IS THE PROBLEM-----") - print(f"frac is {frac}") - print(self) - print(Microsimulation) sim = Microsimulation(dataset=self) - print(sim) - print(sim.subsample) sim.subsample(frac=frac) for key in original_data: @@ -2013,7 +2007,6 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(3 + 9) else: CPS_2021().generate() CPS_2022().generate() diff --git a/pyproject.toml b/pyproject.toml index fe5fda52..4bec19eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ authors = [ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ - "policyengine-us==1.333.0", + "policyengine-us==1.340.1", "policyengine-core>=3.14.1", "pandas==2.3.0", "requests", From 273c48d7bc9db1d6f06fa859897b63c30d37b044 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Sat, 12 Jul 2025 13:01:15 +0100 Subject: [PATCH 23/56] Use normal runner in PR tests --- .github/workflows/pr_code_changes.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 02209591..c84a4b97 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -49,7 +49,7 @@ jobs: run: python -c "from policyengine_core.data import Dataset; print('Core import OK')" Test: - runs-on: ubuntu-latest + runs-on: ubuntu-latest needs: Lint env: HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} From 8c2fbda847e9945878afa4085476f56895c360f1 Mon Sep 17 00:00:00 2001 From: baogorek Date: Sat, 12 Jul 2025 09:53:07 -0400 Subject: [PATCH 24/56] added the 3.11.12 pin --- .github/workflows/code_changes.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/code_changes.yaml b/.github/workflows/code_changes.yaml index edd804db..c2340d14 100644 --- a/.github/workflows/code_changes.yaml +++ b/.github/workflows/code_changes.yaml @@ -38,7 +38,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.11' + python-version: '3.11.12' - uses: "google-github-actions/auth@v2" with: workload_identity_provider: "projects/322898545428/locations/global/workloadIdentityPools/policyengine-research-id-pool/providers/prod-github-provider" From edb09456bb8548b8b4eb94136122ab5a5b33586e Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 20:00:50 -0400 Subject: [PATCH 25/56] cps.py --- policyengine_us_data/datasets/cps/cps.py | 1 + 1 file changed, 1 insertion(+) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index d9957cbb..202f9c69 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2007,6 +2007,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() + print(3) else: CPS_2021().generate() CPS_2022().generate() From 994ac15a636b99f951e205ecb3a861e72cdc3472 Mon Sep 17 00:00:00 2001 From: baogorek 
Date: Sun, 13 Jul 2025 20:32:26 -0400 Subject: [PATCH 26/56] adding diagnostics --- .../datasets/cps/enhanced_cps.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 6ad510f3..17d3e862 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -249,6 +249,23 @@ def generate(self): ) data["household_weight"][year] = optimised_weights + print("\n\n---reweighting quick diagnostics----\n") + estimate = optimised_weights @ loss_matrix + rel_error = ( + ((estimate - targets_array) + 1) / (targets_array + 1) + ) ** 2 + print( + f"rel_error: min: {np.min(rel_error):.2f}, max: {np.max(rel_error):.2f}", + f"mean: {np.mean(rel_error):.2f}, median: {np.median(rel_error):.2f}" + ) + print("Relative error over 100% for:") + for i in np.where(rel_error > 1)[0]: + print(f"target_name: {loss_matrix.columns[i]}") + print(f"target_value: {targets_array[i]}") + print(f"estimate_value: {estimate[i]}") + print(f"has rel_error: {rel_error.values[i]:.2f}\n") + print("---End of reweighting quick diagnostics------") + self.save_dataset(data) From 341a3559f4368f65947db8f0ebe4db67e39a671c Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 20:32:47 -0400 Subject: [PATCH 27/56] lint --- policyengine_us_data/datasets/cps/enhanced_cps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 17d3e862..0da67ceb 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -256,7 +256,7 @@ def generate(self): ) ** 2 print( f"rel_error: min: {np.min(rel_error):.2f}, max: {np.max(rel_error):.2f}", - f"mean: {np.mean(rel_error):.2f}, median: {np.median(rel_error):.2f}" + f"mean: {np.mean(rel_error):.2f}, median: {np.median(rel_error):.2f}", ) print("Relative error over 100% for:") for i in np.where(rel_error > 1)[0]: From c2ab4b6466de68c8970ac859157bc941fc56287b Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 22:27:46 -0400 Subject: [PATCH 28/56] taking out bad targets --- policyengine_us_data/datasets/cps/cps.py | 1 - .../datasets/cps/enhanced_cps.py | 59 +++++++++++++++++-- 2 files changed, 53 insertions(+), 7 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 202f9c69..d9957cbb 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -2007,7 +2007,6 @@ class Pooled_3_Year_CPS_2023(PooledCPS): if test_lite: CPS_2023().generate() CPS_2024().generate() - print(3) else: CPS_2021().generate() CPS_2022().generate() diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 0da67ceb..e7a57044 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -40,7 +40,7 @@ def reweight( targets_array, dropout_rate=0.05, log_path="calibration_log.csv", - penalty_approach=None, + epochs=150, ): target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") @@ -58,8 +58,8 @@ def reweight( np.log(original_weights), requires_grad=True, dtype=torch.float32 ) - # TO DO: replace this with a call to the python reweight.py package. 
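The quick diagnostics introduced in the patches above score each calibration target with a +1-shifted squared relative error, so zero-valued targets do not divide by zero, and then list every target whose error exceeds 100%. A toy numeric check of that expression (made-up estimate and target values, assuming only numpy):

    import numpy as np

    targets_array = np.array([0.0, 1_000.0, 2_000_000.0])  # one zero-valued target
    estimate = np.array([50.0, 900.0, 2_100_000.0])        # hypothetical weighted estimates

    rel_error = (((estimate - targets_array) + 1) / (targets_array + 1)) ** 2
    print(rel_error)      # stays finite for the zero-valued target
    print(rel_error > 1)  # the mask used to report targets with error over 100%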
- def loss(weights, penalty_approach=penalty_approach): + # TODO: replace this functionality from the microcalibrate package. + def loss(weights): # Check for Nans in either the weights or the loss matrix if torch.isnan(weights).any(): raise ValueError("Weights contain NaNs") @@ -129,7 +129,7 @@ def dropout_weights(weights, p): start_loss = None - iterator = trange(500) + iterator = trange(epochs) performance = pd.DataFrame() for i in iterator: optimizer.zero_grad() @@ -229,13 +229,37 @@ def generate(self): original_weights = original_weights.values + np.random.normal( 1, 0.1, len(original_weights) ) + + bad_targets = [ + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "state/RI/adjusted_gross_income/amount/-inf_1", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "state/RI/adjusted_gross_income/amount/-inf_1", + "target_name: nation/irs/exempt interest/count/AGI in -inf-inf/taxable/All", + ] + + # Run the optimization procedure to get (close to) minimum loss weights for year in range(self.start_year, self.end_year + 1): loss_matrix, targets_array = build_loss_matrix( self.input_dataset, year ) - + zero_mask = np.isclose(targets_array, 0.0, atol=0.1) bad_mask = loss_matrix.columns.isin(bad_targets) - keep_mask_bool = ~bad_mask + keep_mask_bool = ~(zero_mask | bad_mask) keep_idx = np.where(keep_mask_bool)[0] loss_matrix_clean = loss_matrix.iloc[:, keep_idx] targets_array_clean = targets_array[keep_idx] @@ -245,10 +269,33 @@ def generate(self): original_weights, loss_matrix_clean, targets_array_clean, + loss_matrix_clean, + targets_array_clean, log_path="calibration_log.csv", + epochs=150, ) data["household_weight"][year] = optimised_weights + print("\n\n---reweighting quick diagnostics----\n") + estimate = optimised_weights @ loss_matrix_clean + rel_error = ( + ((estimate - targets_array_clean) + 1) + / (targets_array_clean + 1) + ) ** 2 + print( + f"rel_error: min: {np.min(rel_error):.2f}, " + f"max: {np.max(rel_error):.2f} " + f"mean: {np.mean(rel_error):.2f}, " + f"median: {np.median(rel_error):.2f}" + ) + print("Relative error over 100% for:") + for i in np.where(rel_error > 1)[0]: + print(f"target_name: {loss_matrix_clean.columns[i]}") + print(f"target_value: {targets_array_clean[i]}") + 
print(f"estimate_value: {estimate[i]}") + print(f"has rel_error: {rel_error[i]:.2f}\n") + print("---End of reweighting quick diagnostics------") + print("\n\n---reweighting quick diagnostics----\n") estimate = optimised_weights @ loss_matrix rel_error = ( From 6f7a03a76dc95d7f9ebfd20f1df6240bd11593bc Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 23:05:09 -0400 Subject: [PATCH 29/56] fixing workflow arg passthrough --- .github/workflows/pr_code_changes.yaml | 16 +++++++++++++--- changelog_entry.yaml | 6 ++++++ pyproject.toml | 4 ++-- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index c84a4b97..56224a2e 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -11,6 +11,14 @@ on: - tests/** - .github/workflows/** + workflow_call: + inputs: + TEST_LITE: + description: 'Run in lite mode' + type: boolean + required: false + default: false + jobs: Lint: runs-on: ubuntu-latest @@ -53,6 +61,7 @@ jobs: needs: Lint env: HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} + TEST_LITE: ${{ inputs.TEST_LITE }} steps: - name: Checkout repo uses: actions/checkout@v2 @@ -63,7 +72,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.11.12' + python-version: '3.11' - name: Install package run: uv pip install -e .[dev] --system @@ -75,8 +84,9 @@ jobs: - name: Build datasets run: make data env: - TEST_LITE: true - PYTHON_LOG_LEVEL: INFO + TEST_LITE: ${{ env.TEST_LITE }} + PYTHON_LOG_LEVEL: INFO + - name: Save calibration log uses: actions/upload-artifact@v4 with: diff --git a/changelog_entry.yaml b/changelog_entry.yaml index dcce3f1a..bce8b349 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,4 +1,10 @@ - bump: patch changes: changed: + - bad targets (causing problems with estimation) removed - lite mode now builds CPS_2023 in addition to CPS_2024 + - gave reweight an epochs argument and set it at 150 for optimization + - updating minimum versions on policyengine-us and pandas dependencies + fixed: + - manual workflow now can call PR code changes + diff --git a/pyproject.toml b/pyproject.toml index 4bec19eb..481cbc37 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,9 +17,9 @@ authors = [ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ - "policyengine-us==1.340.1", + "policyengine-us>=1.340.1", "policyengine-core>=3.14.1", - "pandas==2.3.0", + "pandas>=2.3.0", "requests", "tqdm", "microdf_python>=0.4.3", From 3dba2a2aa3a578aeaa7e7acde71e53d150669036 Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 23:09:32 -0400 Subject: [PATCH 30/56] deps and defaults --- .github/workflows/code_changes.yaml | 2 +- .github/workflows/pr_code_changes.yaml | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/code_changes.yaml b/.github/workflows/code_changes.yaml index c2340d14..edd804db 100644 --- a/.github/workflows/code_changes.yaml +++ b/.github/workflows/code_changes.yaml @@ -38,7 +38,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.11.12' + python-version: '3.11' - uses: "google-github-actions/auth@v2" with: workload_identity_provider: "projects/322898545428/locations/global/workloadIdentityPools/policyengine-research-id-pool/providers/prod-github-provider" diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 
56224a2e..1e05b564 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -17,7 +17,7 @@ on: description: 'Run in lite mode' type: boolean required: false - default: false + default: true jobs: Lint: diff --git a/pyproject.toml b/pyproject.toml index 481cbc37..f983258d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ "policyengine-us>=1.340.1", - "policyengine-core>=3.14.1", + "policyengine-core>=3.17.1", "pandas>=2.3.0", "requests", "tqdm", From 7710a4cd0f58de7b2120f146228977e9c46f253d Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 23:12:21 -0400 Subject: [PATCH 31/56] wrong pipeline for manual test --- .github/workflows/manual_tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/manual_tests.yaml b/.github/workflows/manual_tests.yaml index fb13ba89..fd6fa061 100644 --- a/.github/workflows/manual_tests.yaml +++ b/.github/workflows/manual_tests.yaml @@ -11,7 +11,7 @@ on: jobs: test: - uses: ./.github/workflows/code_changes.yaml + uses: ./.github/workflows/pr_code_changes.yaml with: TEST_LITE: ${{ github.event.inputs.test_lite }} secrets: inherit From 27f46fd8d19199fad6006675bcab231da67968af Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 23:30:46 -0400 Subject: [PATCH 32/56] trying again to get the manual test to work --- .github/workflows/manual_tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/manual_tests.yaml b/.github/workflows/manual_tests.yaml index fd6fa061..55667dbc 100644 --- a/.github/workflows/manual_tests.yaml +++ b/.github/workflows/manual_tests.yaml @@ -13,5 +13,5 @@ jobs: test: uses: ./.github/workflows/pr_code_changes.yaml with: - TEST_LITE: ${{ github.event.inputs.test_lite }} + TEST_LITE: ${{ inputs.test_lite }} secrets: inherit From fef1eca57d99d8359f335ac4886eebde5b45c6c9 Mon Sep 17 00:00:00 2001 From: baogorek Date: Sun, 13 Jul 2025 23:53:27 -0400 Subject: [PATCH 33/56] reverting to older workflow code --- .github/workflows/manual_tests.yaml | 17 ----------------- .github/workflows/pr_code_changes.yaml | 14 ++------------ changelog_entry.yaml | 4 +--- 3 files changed, 3 insertions(+), 32 deletions(-) delete mode 100644 .github/workflows/manual_tests.yaml diff --git a/.github/workflows/manual_tests.yaml b/.github/workflows/manual_tests.yaml deleted file mode 100644 index 55667dbc..00000000 --- a/.github/workflows/manual_tests.yaml +++ /dev/null @@ -1,17 +0,0 @@ -name: Manual tests - -on: - workflow_dispatch: - inputs: - test_lite: - description: 'Run in lite mode' - required: true - default: true - type: boolean - -jobs: - test: - uses: ./.github/workflows/pr_code_changes.yaml - with: - TEST_LITE: ${{ inputs.test_lite }} - secrets: inherit diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 1e05b564..4e30d089 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -11,14 +11,6 @@ on: - tests/** - .github/workflows/** - workflow_call: - inputs: - TEST_LITE: - description: 'Run in lite mode' - type: boolean - required: false - default: true - jobs: Lint: runs-on: ubuntu-latest @@ -61,7 +53,6 @@ jobs: needs: Lint env: HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} - TEST_LITE: ${{ inputs.TEST_LITE }} steps: - name: Checkout repo uses: actions/checkout@v2 @@ -84,9 +75,8 @@ jobs: - name: Build datasets run: make data 
env: - TEST_LITE: ${{ env.TEST_LITE }} - PYTHON_LOG_LEVEL: INFO - + TEST_LITE: true + PYTHON_LOG_LEVEL: INFO - name: Save calibration log uses: actions/upload-artifact@v4 with: diff --git a/changelog_entry.yaml b/changelog_entry.yaml index bce8b349..3f9b8627 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -5,6 +5,4 @@ - lite mode now builds CPS_2023 in addition to CPS_2024 - gave reweight an epochs argument and set it at 150 for optimization - updating minimum versions on policyengine-us and pandas dependencies - fixed: - - manual workflow now can call PR code changes - + - getting rid of non-working manual workflow code From 5eb10501cd4e8f33925411de7f4574e3dec413f8 Mon Sep 17 00:00:00 2001 From: baogorek Date: Mon, 14 Jul 2025 00:12:37 -0400 Subject: [PATCH 34/56] cleaning up enhanced_cps.py --- .../datasets/cps/enhanced_cps.py | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index e7a57044..5c82d724 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -249,7 +249,7 @@ def generate(self): "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", "state/RI/adjusted_gross_income/amount/-inf_1", - "target_name: nation/irs/exempt interest/count/AGI in -inf-inf/taxable/All", + "nation/irs/exempt interest/count/AGI in -inf-inf/taxable/All", ] # Run the optimization procedure to get (close to) minimum loss weights @@ -296,23 +296,6 @@ def generate(self): print(f"has rel_error: {rel_error[i]:.2f}\n") print("---End of reweighting quick diagnostics------") - print("\n\n---reweighting quick diagnostics----\n") - estimate = optimised_weights @ loss_matrix - rel_error = ( - ((estimate - targets_array) + 1) / (targets_array + 1) - ) ** 2 - print( - f"rel_error: min: {np.min(rel_error):.2f}, max: {np.max(rel_error):.2f}", - f"mean: {np.mean(rel_error):.2f}, median: {np.median(rel_error):.2f}", - ) - print("Relative error over 100% for:") - for i in np.where(rel_error > 1)[0]: - print(f"target_name: {loss_matrix.columns[i]}") - print(f"target_value: {targets_array[i]}") - print(f"estimate_value: {estimate[i]}") - print(f"has rel_error: {rel_error.values[i]:.2f}\n") - print("---End of reweighting quick diagnostics------") - self.save_dataset(data) From 1fb4318b21072a9c5dbd2824216be49655f0b9b2 Mon Sep 17 00:00:00 2001 From: MaxGhenis Date: Mon, 14 Jul 2025 15:33:13 +0000 Subject: [PATCH 35/56] Update package version --- CHANGELOG.md | 11 +++++++++++ changelog.yaml | 9 +++++++++ changelog_entry.yaml | 8 -------- pyproject.toml | 2 +- 4 files changed, 21 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6299d8fb..e355d4dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## [1.37.1] - 2025-07-14 15:33:11 + +### Changed + +- bad targets (causing problems with estimation) removed +- lite mode now builds CPS_2023 in addition to CPS_2024 +- gave reweight an epochs argument and set it at 150 for optimization +- updating minimum versions on policyengine-us and pandas dependencies +- getting rid of non-working manual workflow code + ## [1.37.0] - 2025-07-09 14:58:33 ### Added @@ -520,6 +530,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 +[1.37.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.37.0...1.37.1 [1.37.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.36.2...1.37.0 [1.36.2]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.36.1...1.36.2 [1.36.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.36.0...1.36.1 diff --git a/changelog.yaml b/changelog.yaml index 699b2430..af7cdf32 100644 --- a/changelog.yaml +++ b/changelog.yaml @@ -433,3 +433,12 @@ added: - Medicaid state level calibration targets. date: 2025-07-09 14:58:33 +- bump: patch + changes: + changed: + - bad targets (causing problems with estimation) removed + - lite mode now builds CPS_2023 in addition to CPS_2024 + - gave reweight an epochs argument and set it at 150 for optimization + - updating minimum versions on policyengine-us and pandas dependencies + - getting rid of non-working manual workflow code + date: 2025-07-14 15:33:11 diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 3f9b8627..e69de29b 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,8 +0,0 @@ -- bump: patch - changes: - changed: - - bad targets (causing problems with estimation) removed - - lite mode now builds CPS_2023 in addition to CPS_2024 - - gave reweight an epochs argument and set it at 150 for optimization - - updating minimum versions on policyengine-us and pandas dependencies - - getting rid of non-working manual workflow code diff --git a/pyproject.toml b/pyproject.toml index f983258d..5a75693f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta" [project] name = "policyengine_us_data" -version = "1.37.0" +version = "1.37.1" description = "A package to create representative microdata for the US." readme = "README.md" authors = [ From a62328a6f47293f90e1e696d03b49b96c044321b Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Fri, 11 Jul 2025 16:24:22 +0200 Subject: [PATCH 36/56] attempting to vectorize minimizing of ecps --- changelog_entry.yaml | 4 ++ .../datasets/cps/enhanced_cps.py | 53 +++++++------------ policyengine_us_data/utils/minimise.py | 51 ++++++++++++------ 3 files changed, 59 insertions(+), 49 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..84eeb584 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: minor + changes: + added: + - Enhanced CPS minimizing tests. \ No newline at end of file diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 5c82d724..6616d54c 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -58,8 +58,8 @@ def reweight( np.log(original_weights), requires_grad=True, dtype=torch.float32 ) - # TODO: replace this functionality from the microcalibrate package. - def loss(weights): + # TO DO: replace this with a call to the python reweight.py package. 
+ def loss(weights, penalty_approach="l0_sigmoid"): # Check for Nans in either the weights or the loss matrix if torch.isnan(weights).any(): raise ValueError("Weights contain NaNs") @@ -76,43 +76,30 @@ def loss(weights): if torch.isnan(rel_error_normalized).any(): raise ValueError("Relative error contains NaNs") - if penalty_approach is not None: - # L0 penalty (approximated with smooth function) - # Since L0 is non-differentiable, we use a smooth approximation - # Common approaches: + # L0 penalty (approximated with smooth function) + # Since L0 is non-differentiable, we use a smooth approximation + # Common approaches: - epsilon = 1e-3 # Threshold for "near zero" - l0_penalty_weight = 1e-1 # Adjust this hyperparameter + epsilon = 1e-3 # Threshold for "near zero" + l0_penalty_weight = 1e-1 # Adjust this hyperparameter - # Option 1: Sigmoid approximation - if penalty_approach == "l0_sigmoid": - smoothed_l0 = torch.sigmoid( - (weights - epsilon) / (epsilon * 0.1) - ).mean() + # Option 1: Sigmoid approximation + if penalty_approach == "l0_sigmoid": + smoothed_l0 = torch.sigmoid( + (weights - epsilon) / (epsilon * 0.1) + ).mean() - # Option 2: Log-sum penalty (smoother) - if penalty_approach == "l0_log": - smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len( - weights - ) + # Option 2: Log-sum penalty (smoother) + if penalty_approach == "l0_log": + smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len(weights) - # Option 3: Exponential penalty - if penalty_approach == "l0_exp": - smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() + # Option 3: Exponential penalty + if penalty_approach == "l0_exp": + smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() - # L1 penalty - l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs + # L1 penalty - if penalty_approach == "l1": - l1 = torch.mean(weights) - return rel_error_normalized.mean() + l1_penalty_weight * l1 - - return ( - rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 - ) - - else: - return rel_error_normalized.mean() + return rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 def dropout_weights(weights, p): if p == 0: diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index df193c6e..ca985378 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -5,21 +5,10 @@ import pandas as pd import h5py from policyengine_us_data.storage import STORAGE_FOLDER -from typing import Optional, Callable +from typing import Optional -bad_targets = [ - "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", - "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", - "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", - "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", - "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", -] - -def create_calibration_log_file(file_path, epoch=0): +def create_calibration_log_file(file_path): dataset = Dataset.from_file(file_path) loss_matrix, targets = build_loss_matrix(dataset, 2024) @@ -112,6 +101,27 @@ def losses_for_candidates( return losses +def minimise_dataset( + dataset, 
output_path: str, loss_rel_change_max: float +) -> None: + dataset = str(dataset) + create_calibration_log_file(dataset) + + dataset = Dataset.from_file(dataset) + loss_matrix = build_loss_matrix(dataset, 2024) + + sim = Microsimulation(dataset=dataset) + + weights = sim.calculate("household_weight", 2024).values + estimate_matrix, targets = loss_matrix + is_national = estimate_matrix.columns.str.startswith("nation/") + nation_normalisation_factor = is_national * (1 / is_national.sum()) + state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) + normalisation_factor = np.where( + is_national, nation_normalisation_factor, state_normalisation_factor + ) + weights @ estimate_matrix + def get_loss_from_mask( weights, inclusion_mask, estimate_matrix, targets, normalisation_factor ): @@ -185,16 +195,25 @@ def candidate_loss_contribution( replace=False, ) - # Compute losses for the batch in one shot + # more efficient approach to compute losses for candidate households to be removed + + # 1. sample only households that are currently *included* + indices = np.random.choice( + np.where(full_mask)[0], + size=int(full_mask.sum() * VIEW_FRACTION_PER_ITERATION), + replace=False, + ) + # 2. compute losses for the batch in one shot candidate_losses = losses_for_candidates( weights, indices, estimate_matrix, targets, normalisation_factor ) - - # Convert to relative change vs. baseline + # 3. convert to relative change vs. baseline household_loss_rel_changes = ( candidate_losses - baseline_loss ) / baseline_loss + inclusion_mask = full_mask.copy() + household_loss_rel_changes = np.array(household_loss_rel_changes) # Sort by the relative change in loss sorted_indices = np.argsort(household_loss_rel_changes) From 6d3f8b4daea6ab498b105bf9429b74e52462cde4 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Fri, 11 Jul 2025 19:19:58 +0200 Subject: [PATCH 37/56] add notebook with testing functionality (havent tested locally) --- .../datasets/cps/enhanced_cps.py | 9 +- policyengine_us_data/utils/minimise.py | 2 +- test_minimization_approach.ipynb | 210 +----------------- 3 files changed, 16 insertions(+), 205 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 6616d54c..ca53a84d 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -40,7 +40,7 @@ def reweight( targets_array, dropout_rate=0.05, log_path="calibration_log.csv", - epochs=150, + penalty_approach="l0_sigmoid", ): target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") @@ -59,7 +59,7 @@ def reweight( ) # TO DO: replace this with a call to the python reweight.py package. 
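The penalty_approach options introduced in PATCH 36 above are all smooth surrogates for an L0 count of non-zero household weights, which is not differentiable and so cannot be optimised directly by gradient descent. A rough sketch of the sigmoid variant on a toy weight vector (epsilon and the penalty weight mirror the hyperparameters in the diff; the weights themselves are made up):

```python
import torch

# Toy non-negative household weights (illustrative values only)
weights = torch.tensor([0.0, 0.0005, 0.002, 3.5, 120.0])

epsilon = 1e-3            # threshold for "near zero", as in the patch
l0_penalty_weight = 1e-1  # strength of the sparsity term

# Sigmoid approximation: ~0 for weights well below epsilon, ~1 well above,
# so the mean approximates the fraction of households with non-negligible weight.
smoothed_l0 = torch.sigmoid((weights - epsilon) / (epsilon * 0.1)).mean()

# Added to the calibration loss, this term rewards the optimiser for driving
# uninformative household weights towards zero, which is what later allows the
# dataset to be shrunk by dropping near-zero-weight households.
penalty_term = l0_penalty_weight * smoothed_l0
print(round(smoothed_l0.item(), 3), round(penalty_term.item(), 4))
```

The log-sum and exponential variants in the same hunk are smoother alternatives to the sharp sigmoid threshold, and the l1 option simply penalises the mean weight.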
- def loss(weights, penalty_approach="l0_sigmoid"): + def loss(weights, penalty_approach=penalty_approach): # Check for Nans in either the weights or the loss matrix if torch.isnan(weights).any(): raise ValueError("Weights contain NaNs") @@ -98,6 +98,11 @@ def loss(weights, penalty_approach="l0_sigmoid"): smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() # L1 penalty + l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs + + if penalty_approach == "l1": + l1 = torch.mean(weights) + return rel_error_normalized.mean() + l1_penalty_weight * l1 return rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index ca985378..da2cb7d1 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -386,7 +386,7 @@ def minimise_dataset( f.create_dataset(f"{variable}/{year}", data=value) print(f"Saved minimised dataset to {output_path}") - create_calibration_log_file(output_path, epoch=500) + create_calibration_log_file(output_path) if __name__ == "__main__": diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 54f3c6fa..519d2725 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 5, "id": "d6dc9cca", "metadata": {}, "outputs": [], @@ -12,199 +12,15 @@ "from policyengine_us import Microsimulation\n", "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", "from policyengine_us_data.utils import build_loss_matrix\n", - "import numpy as np\n", - "import os" + "import numpy as np" ] }, { "cell_type": "code", "execution_count": null, - "id": "6daabe7c", - "metadata": {}, - "outputs": [], - "source": [ - "# Original ECPS 2024 dataset size (for household entity): 41310\n", - "# After minimization through \"candidate_loss_contribution\" and a 1.0 max error change (for household entity): 20655 \n", - "# After minimization through \"candidate_loss_contribution\" and a 0.001 max error change (for household entity): 24786\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, "id": "db975ac1", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Targeting Medicaid enrollment for AK with target 231577k\n", - "Targeting Medicaid enrollment for AL with target 766009k\n", - "Targeting Medicaid enrollment for AR with target 733561k\n", - "Targeting Medicaid enrollment for AZ with target 1778734k\n", - "Targeting Medicaid enrollment for CA with target 12172695k\n", - "Targeting Medicaid enrollment for CO with target 1058326k\n", - "Targeting Medicaid enrollment for CT with target 904321k\n", - "Targeting Medicaid enrollment for DC with target 240020k\n", - "Targeting Medicaid enrollment for DE with target 236840k\n", - "Targeting Medicaid enrollment for FL with target 3568648k\n", - "Targeting Medicaid enrollment for GA with target 1699279k\n", - "Targeting Medicaid enrollment for HI with target 376318k\n", - "Targeting Medicaid enrollment for IA with target 586748k\n", - "Targeting Medicaid enrollment for ID with target 296968k\n", - "Targeting Medicaid enrollment for IL with target 2918179k\n", - "Targeting Medicaid enrollment for IN with target 1623361k\n", - "Targeting Medicaid enrollment for KS with target 335902k\n", - "Targeting Medicaid enrollment for KY with target 1244822k\n", - "Targeting Medicaid enrollment for LA 
with target 1377806k\n", - "Targeting Medicaid enrollment for MA with target 1453344k\n", - "Targeting Medicaid enrollment for MD with target 1280697k\n", - "Targeting Medicaid enrollment for ME with target 322306k\n", - "Targeting Medicaid enrollment for MI with target 2194067k\n", - "Targeting Medicaid enrollment for MN with target 1146667k\n", - "Targeting Medicaid enrollment for MO with target 1118780k\n", - "Targeting Medicaid enrollment for MS with target 514730k\n", - "Targeting Medicaid enrollment for MT with target 193278k\n", - "Targeting Medicaid enrollment for NC with target 2469712k\n", - "Targeting Medicaid enrollment for ND with target 100543k\n", - "Targeting Medicaid enrollment for NE with target 302971k\n", - "Targeting Medicaid enrollment for NH with target 166813k\n", - "Targeting Medicaid enrollment for NJ with target 1506239k\n", - "Targeting Medicaid enrollment for NM with target 686825k\n", - "Targeting Medicaid enrollment for NV with target 713936k\n", - "Targeting Medicaid enrollment for NY with target 5946806k\n", - "Targeting Medicaid enrollment for OH with target 2596879k\n", - "Targeting Medicaid enrollment for OK with target 894911k\n", - "Targeting Medicaid enrollment for OR with target 1123313k\n", - "Targeting Medicaid enrollment for PA with target 2783389k\n", - "Targeting Medicaid enrollment for RI with target 273400k\n", - "Targeting Medicaid enrollment for SC with target 932515k\n", - "Targeting Medicaid enrollment for SD with target 126952k\n", - "Targeting Medicaid enrollment for TN with target 1268904k\n", - "Targeting Medicaid enrollment for TX with target 3821806k\n", - "Targeting Medicaid enrollment for UT with target 300742k\n", - "Targeting Medicaid enrollment for VA with target 1596777k\n", - "Targeting Medicaid enrollment for VT with target 151833k\n", - "Targeting Medicaid enrollment for WA with target 1776116k\n", - "Targeting Medicaid enrollment for WI with target 1108320k\n", - "Targeting Medicaid enrollment for WV with target 467632k\n", - "Targeting Medicaid enrollment for WY with target 57320k\n", - "Targeting Medicaid enrollment for AK with target 231577k\n", - "Targeting Medicaid enrollment for AL with target 766009k\n", - "Targeting Medicaid enrollment for AR with target 733561k\n", - "Targeting Medicaid enrollment for AZ with target 1778734k\n", - "Targeting Medicaid enrollment for CA with target 12172695k\n", - "Targeting Medicaid enrollment for CO with target 1058326k\n", - "Targeting Medicaid enrollment for CT with target 904321k\n", - "Targeting Medicaid enrollment for DC with target 240020k\n", - "Targeting Medicaid enrollment for DE with target 236840k\n", - "Targeting Medicaid enrollment for FL with target 3568648k\n", - "Targeting Medicaid enrollment for GA with target 1699279k\n", - "Targeting Medicaid enrollment for HI with target 376318k\n", - "Targeting Medicaid enrollment for IA with target 586748k\n", - "Targeting Medicaid enrollment for ID with target 296968k\n", - "Targeting Medicaid enrollment for IL with target 2918179k\n", - "Targeting Medicaid enrollment for IN with target 1623361k\n", - "Targeting Medicaid enrollment for KS with target 335902k\n", - "Targeting Medicaid enrollment for KY with target 1244822k\n", - "Targeting Medicaid enrollment for LA with target 1377806k\n", - "Targeting Medicaid enrollment for MA with target 1453344k\n", - "Targeting Medicaid enrollment for MD with target 1280697k\n", - "Targeting Medicaid enrollment for ME with target 322306k\n", - "Targeting Medicaid enrollment for MI with 
target 2194067k\n", - "Targeting Medicaid enrollment for MN with target 1146667k\n", - "Targeting Medicaid enrollment for MO with target 1118780k\n", - "Targeting Medicaid enrollment for MS with target 514730k\n", - "Targeting Medicaid enrollment for MT with target 193278k\n", - "Targeting Medicaid enrollment for NC with target 2469712k\n", - "Targeting Medicaid enrollment for ND with target 100543k\n", - "Targeting Medicaid enrollment for NE with target 302971k\n", - "Targeting Medicaid enrollment for NH with target 166813k\n", - "Targeting Medicaid enrollment for NJ with target 1506239k\n", - "Targeting Medicaid enrollment for NM with target 686825k\n", - "Targeting Medicaid enrollment for NV with target 713936k\n", - "Targeting Medicaid enrollment for NY with target 5946806k\n", - "Targeting Medicaid enrollment for OH with target 2596879k\n", - "Targeting Medicaid enrollment for OK with target 894911k\n", - "Targeting Medicaid enrollment for OR with target 1123313k\n", - "Targeting Medicaid enrollment for PA with target 2783389k\n", - "Targeting Medicaid enrollment for RI with target 273400k\n", - "Targeting Medicaid enrollment for SC with target 932515k\n", - "Targeting Medicaid enrollment for SD with target 126952k\n", - "Targeting Medicaid enrollment for TN with target 1268904k\n", - "Targeting Medicaid enrollment for TX with target 3821806k\n", - "Targeting Medicaid enrollment for UT with target 300742k\n", - "Targeting Medicaid enrollment for VA with target 1596777k\n", - "Targeting Medicaid enrollment for VT with target 151833k\n", - "Targeting Medicaid enrollment for WA with target 1776116k\n", - "Targeting Medicaid enrollment for WI with target 1108320k\n", - "Targeting Medicaid enrollment for WV with target 467632k\n", - "Targeting Medicaid enrollment for WY with target 57320k\n", - "Iteration 1: Loss changed from 3.2762747622384236e-07 to 3.269240481699973e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 2: Loss changed from 3.269240481699973e-07 to 3.260236288186747e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 3: Loss changed from 3.260236288186747e-07 to 3.258031076364204e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 4: Loss changed from 3.258031076364204e-07 to 3.2572688458943833e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 5: Loss changed from 3.2572688458943833e-07 to 3.2598496400284314e-07, which is too high (0.08%). 
Stopping.\n", - "Weight relative change: 99.10%\n", - "Saved minimised dataset to /Users/movil1/Desktop/PYTHONJOBS/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", - "Targeting Medicaid enrollment for AK with target 231577k\n", - "Targeting Medicaid enrollment for AL with target 766009k\n", - "Targeting Medicaid enrollment for AR with target 733561k\n", - "Targeting Medicaid enrollment for AZ with target 1778734k\n", - "Targeting Medicaid enrollment for CA with target 12172695k\n", - "Targeting Medicaid enrollment for CO with target 1058326k\n", - "Targeting Medicaid enrollment for CT with target 904321k\n", - "Targeting Medicaid enrollment for DC with target 240020k\n", - "Targeting Medicaid enrollment for DE with target 236840k\n", - "Targeting Medicaid enrollment for FL with target 3568648k\n", - "Targeting Medicaid enrollment for GA with target 1699279k\n", - "Targeting Medicaid enrollment for HI with target 376318k\n", - "Targeting Medicaid enrollment for IA with target 586748k\n", - "Targeting Medicaid enrollment for ID with target 296968k\n", - "Targeting Medicaid enrollment for IL with target 2918179k\n", - "Targeting Medicaid enrollment for IN with target 1623361k\n", - "Targeting Medicaid enrollment for KS with target 335902k\n", - "Targeting Medicaid enrollment for KY with target 1244822k\n", - "Targeting Medicaid enrollment for LA with target 1377806k\n", - "Targeting Medicaid enrollment for MA with target 1453344k\n", - "Targeting Medicaid enrollment for MD with target 1280697k\n", - "Targeting Medicaid enrollment for ME with target 322306k\n", - "Targeting Medicaid enrollment for MI with target 2194067k\n", - "Targeting Medicaid enrollment for MN with target 1146667k\n", - "Targeting Medicaid enrollment for MO with target 1118780k\n", - "Targeting Medicaid enrollment for MS with target 514730k\n", - "Targeting Medicaid enrollment for MT with target 193278k\n", - "Targeting Medicaid enrollment for NC with target 2469712k\n", - "Targeting Medicaid enrollment for ND with target 100543k\n", - "Targeting Medicaid enrollment for NE with target 302971k\n", - "Targeting Medicaid enrollment for NH with target 166813k\n", - "Targeting Medicaid enrollment for NJ with target 1506239k\n", - "Targeting Medicaid enrollment for NM with target 686825k\n", - "Targeting Medicaid enrollment for NV with target 713936k\n", - "Targeting Medicaid enrollment for NY with target 5946806k\n", - "Targeting Medicaid enrollment for OH with target 2596879k\n", - "Targeting Medicaid enrollment for OK with target 894911k\n", - "Targeting Medicaid enrollment for OR with target 1123313k\n", - "Targeting Medicaid enrollment for PA with target 2783389k\n", - "Targeting Medicaid enrollment for RI with target 273400k\n", - "Targeting Medicaid enrollment for SC with target 932515k\n", - "Targeting Medicaid enrollment for SD with target 126952k\n", - "Targeting Medicaid enrollment for TN with target 1268904k\n", - "Targeting Medicaid enrollment for TX with target 3821806k\n", - "Targeting Medicaid enrollment for UT with target 300742k\n", - "Targeting Medicaid enrollment for VA with target 1596777k\n", - "Targeting Medicaid enrollment for VT with target 151833k\n", - "Targeting Medicaid enrollment for WA with target 1776116k\n", - "Targeting Medicaid enrollment for WI with target 1108320k\n", - "Targeting Medicaid enrollment for WV with target 467632k\n", - "Targeting Medicaid enrollment for WY with target 57320k\n" - ] - } - ], + "outputs": 
[], "source": [ "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", "\n", @@ -212,20 +28,18 @@ " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", " ]\n", "\n", - "approach = \"candidate_loss_contribution\" # for which you can specify the fraction\n", - "minimization_function = candidate_loss_contribution\n", - "# other minimization function approach is \"random_sampling_minimization\", for which you can specify the tolerance for loss relative change.\n", + "approach = \"random_sampling_minimization\" # for which you can specify the fraction\n", + "minimization_function = random_sampling_minimization\n", + "# other minimization function approach is \"candidate_loss_contribution\"\n", "\n", "for file in files:\n", " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", - " output_path.parent.mkdir(parents=True, exist_ok=True)\n", - "\n", " minimise_dataset(\n", " file,\n", " output_path,\n", + " loss_rel_change_max=10,\n", " minimization_function=minimization_function, \n", - " # target_fractions=[0.5] # remove if switching approach\n", - " loss_rel_change_max=0.0001, # remove if switching approach\n", + " target_fractions=[0.5] # remove if switching approach\n", " )" ] }, @@ -267,14 +81,6 @@ "\n", "data.save_dataset(output_path)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b4cf8e89", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From 94cacdeab007e318fe849bb3bbf4b29d7fcf627a Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Mon, 14 Jul 2025 15:22:58 +0200 Subject: [PATCH 38/56] few updates to the testing framework --- changelog_entry.yaml | 2 +- .../datasets/cps/enhanced_cps.py | 58 ++--- policyengine_us_data/utils/minimise.py | 59 +++++- pyproject.toml | 3 +- test_minimization_approach.ipynb | 198 +++++++++++++++++- 5 files changed, 280 insertions(+), 40 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 84eeb584..ac664753 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,4 +1,4 @@ - bump: minor changes: added: - - Enhanced CPS minimizing tests. \ No newline at end of file + - Enhanced CPS minimizing tests. 
\ No newline at end of file diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index ca53a84d..bf4b5501 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -40,7 +40,7 @@ def reweight( targets_array, dropout_rate=0.05, log_path="calibration_log.csv", - penalty_approach="l0_sigmoid", + penalty_approach=None, ): target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") @@ -76,35 +76,43 @@ def loss(weights, penalty_approach=penalty_approach): if torch.isnan(rel_error_normalized).any(): raise ValueError("Relative error contains NaNs") - # L0 penalty (approximated with smooth function) - # Since L0 is non-differentiable, we use a smooth approximation - # Common approaches: + if penalty_approach is not None: + # L0 penalty (approximated with smooth function) + # Since L0 is non-differentiable, we use a smooth approximation + # Common approaches: - epsilon = 1e-3 # Threshold for "near zero" - l0_penalty_weight = 1e-1 # Adjust this hyperparameter + epsilon = 1e-3 # Threshold for "near zero" + l0_penalty_weight = 1e-1 # Adjust this hyperparameter - # Option 1: Sigmoid approximation - if penalty_approach == "l0_sigmoid": - smoothed_l0 = torch.sigmoid( - (weights - epsilon) / (epsilon * 0.1) - ).mean() + # Option 1: Sigmoid approximation + if penalty_approach == "l0_sigmoid": + smoothed_l0 = torch.sigmoid( + (weights - epsilon) / (epsilon * 0.1) + ).mean() - # Option 2: Log-sum penalty (smoother) - if penalty_approach == "l0_log": - smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len(weights) + # Option 2: Log-sum penalty (smoother) + if penalty_approach == "l0_log": + smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len( + weights + ) - # Option 3: Exponential penalty - if penalty_approach == "l0_exp": - smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() + # Option 3: Exponential penalty + if penalty_approach == "l0_exp": + smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() - # L1 penalty - l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs + # L1 penalty + l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs - if penalty_approach == "l1": - l1 = torch.mean(weights) - return rel_error_normalized.mean() + l1_penalty_weight * l1 + if penalty_approach == "l1": + l1 = torch.mean(weights) + return rel_error_normalized.mean() + l1_penalty_weight * l1 - return rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 + return ( + rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 + ) + + else: + return rel_error_normalized.mean() def dropout_weights(weights, p): if p == 0: @@ -249,9 +257,9 @@ def generate(self): loss_matrix, targets_array = build_loss_matrix( self.input_dataset, year ) - zero_mask = np.isclose(targets_array, 0.0, atol=0.1) + bad_mask = loss_matrix.columns.isin(bad_targets) - keep_mask_bool = ~(zero_mask | bad_mask) + keep_mask_bool = ~bad_mask keep_idx = np.where(keep_mask_bool)[0] loss_matrix_clean = loss_matrix.iloc[:, keep_idx] targets_array_clean = targets_array[keep_idx] diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index da2cb7d1..9c3d59eb 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -5,14 +5,33 @@ import pandas as pd import h5py from policyengine_us_data.storage import STORAGE_FOLDER -from typing import Optional +from typing import Optional, Callable 
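The bad_targets list defined just below, like the one in PATCH 28, is used to drop known-problematic calibration targets from the loss matrix by column name before any reweighting or shrinking. A self-contained sketch of that masking step, using placeholder column names rather than real targets:

```python
import numpy as np
import pandas as pd

# Toy stand-in for the (households x targets) loss matrix and its target vector
loss_matrix = pd.DataFrame(
    np.ones((3, 3)),
    columns=[
        "nation/toy/good_target_a",
        "nation/toy/known_bad_target",
        "state/XX/toy/good_target_b",
    ],
)
targets = np.array([100.0, 5.0, 40.0])

bad_targets = ["nation/toy/known_bad_target"]

# Boolean mask over columns, then keep only the well-behaved targets
bad_mask = loss_matrix.columns.isin(bad_targets)
keep_idx = np.where(~bad_mask)[0]
loss_matrix_clean = loss_matrix.iloc[:, keep_idx]
targets_clean = targets[keep_idx]

# Columns and targets must stay aligned after filtering
assert loss_matrix_clean.shape[1] == targets_clean.size
```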
+bad_targets = [ + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", +] -def create_calibration_log_file(file_path): + +def create_calibration_log_file(file_path, epoch=0): dataset = Dataset.from_file(file_path) loss_matrix, targets = build_loss_matrix(dataset, 2024) + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_clean = targets[keep_idx] + assert loss_matrix_clean.shape[1] == targets_clean.size + loss_matrix, targets = build_loss_matrix(dataset, 2024) + bad_mask = loss_matrix.columns.isin(bad_targets) keep_mask_bool = ~bad_mask keep_idx = np.where(keep_mask_bool)[0] @@ -22,6 +41,10 @@ def create_calibration_log_file(file_path): sim = Microsimulation(dataset=dataset) + estimates = ( + sim.calculate("household_weight", 2024).values @ loss_matrix_clean + ) + target_names = loss_matrix_clean.columns estimates = ( sim.calculate("household_weight", 2024).values @ loss_matrix_clean ) @@ -32,9 +55,11 @@ def create_calibration_log_file(file_path): "target_name": target_names, "estimate": estimates, "target": targets_clean, + "target": targets_clean, } ) df["epoch"] = epoch + df["epoch"] = epoch df["error"] = df["estimate"] - df["target"] df["rel_error"] = df["error"] / df["target"] df["abs_error"] = df["error"].abs() @@ -43,6 +68,11 @@ def create_calibration_log_file(file_path): if df["target"].abs().sum() > 0 else np.nan ) + df["rel_abs_error"] = ( + df["abs_error"] / df["target"].abs() + if df["target"].abs().sum() > 0 + else np.nan + ) df["loss"] = (df["rel_error"] ** 2).mean() df.to_csv( @@ -258,6 +288,7 @@ def random_sampling_minimization( targets, normalisation_factor, random=True, + random=True, target_fractions=[0.1, 0.2, 0.3, 0.4, 0.5], ): """A simple random sampling approach""" @@ -265,6 +296,8 @@ def random_sampling_minimization( household_weights_normalized = weights / weights.sum() + household_weights_normalized = weights / weights.sum() + final_mask = None lowest_loss = float("inf") for fraction in target_fractions: @@ -283,6 +316,14 @@ def random_sampling_minimization( replace=False, ) ] = True + mask[ + np.random.choice( + n, + target_size, + p=household_weights_normalized if random else None, + replace=False, + ) + ] = True loss = get_loss_from_mask( weights, mask, estimate_matrix, targets, normalisation_factor @@ -322,6 +363,14 @@ def minimise_dataset( dataset = Dataset.from_file(dataset) loss_matrix, targets = build_loss_matrix(dataset, 2024) + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_clean = targets[keep_idx] + assert loss_matrix_clean.shape[1] == targets_clean.size + loss_matrix, targets = build_loss_matrix(dataset, 2024) + bad_mask = 
loss_matrix.columns.isin(bad_targets) keep_mask_bool = ~bad_mask keep_idx = np.where(keep_mask_bool)[0] @@ -333,6 +382,7 @@ def minimise_dataset( weights = sim.calculate("household_weight", 2024).values is_national = loss_matrix_clean.columns.str.startswith("nation/") + is_national = loss_matrix_clean.columns.str.startswith("nation/") nation_normalisation_factor = is_national * (1 / is_national.sum()) state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) normalisation_factor = np.where( @@ -344,8 +394,11 @@ def minimise_dataset( weights=weights, estimate_matrix=loss_matrix_clean, targets=targets_clean, + estimate_matrix=loss_matrix_clean, + targets=targets_clean, normalisation_factor=normalisation_factor, **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. + **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. ) # Extract household IDs for remaining households @@ -386,7 +439,7 @@ def minimise_dataset( f.create_dataset(f"{variable}/{year}", data=value) print(f"Saved minimised dataset to {output_path}") - create_calibration_log_file(output_path) + create_calibration_log_file(output_path, epoch=500) if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml index 5a75693f..7f3e59b5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,9 +17,8 @@ authors = [ license = {file = "LICENSE"} requires-python = ">=3.11, <3.13.0" dependencies = [ - "policyengine-us>=1.340.1", + "policyengine-us>=1.340.0", "policyengine-core>=3.17.1", - "pandas>=2.3.0", "requests", "tqdm", "microdf_python>=0.4.3", diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 519d2725..5a7a9d15 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "d6dc9cca", "metadata": {}, "outputs": [], @@ -12,15 +12,188 @@ "from policyengine_us import Microsimulation\n", "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", "from policyengine_us_data.utils import build_loss_matrix\n", - "import numpy as np" + "import numpy as np\n", + "import os\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "db975ac1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + "Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid enrollment for IN with target 1623361k\n", + 
"Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with target 1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + "Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n", + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + "Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid enrollment for IN with target 1623361k\n", + "Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with target 1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + 
"Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n", + "Iteration 1: Loss changed from 4.565926440883813e-08 to 5.336074793134824e-08\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 2: Loss changed from 5.336074793134824e-08 to 6.613975524012925e-08\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 3: Loss changed from 6.613975524012925e-08 to 8.772356346466888e-08\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 4: Loss changed from 8.772356346466888e-08 to 1.1406333044735764e-07\n", + "Removed 4131 households with worst relative loss changes.\n", + "Iteration 5: Loss changed from 1.1406333044735764e-07 to 1.3666972105275468e-07\n", + "Removed 4131 households with worst relative loss changes.\n", + "Weight relative change: 52.19%\n", + "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", + "Targeting Medicaid enrollment for AK with target 231577k\n", + "Targeting Medicaid enrollment for AL with target 766009k\n", + "Targeting Medicaid enrollment for AR with target 733561k\n", + "Targeting Medicaid enrollment for AZ with target 1778734k\n", + "Targeting Medicaid enrollment for CA with target 12172695k\n", + "Targeting Medicaid enrollment for CO with target 1058326k\n", + "Targeting Medicaid enrollment for CT with target 904321k\n", + "Targeting Medicaid enrollment for DC with target 240020k\n", + "Targeting Medicaid enrollment for DE with target 236840k\n", + 
"Targeting Medicaid enrollment for FL with target 3568648k\n", + "Targeting Medicaid enrollment for GA with target 1699279k\n", + "Targeting Medicaid enrollment for HI with target 376318k\n", + "Targeting Medicaid enrollment for IA with target 586748k\n", + "Targeting Medicaid enrollment for ID with target 296968k\n", + "Targeting Medicaid enrollment for IL with target 2918179k\n", + "Targeting Medicaid enrollment for IN with target 1623361k\n", + "Targeting Medicaid enrollment for KS with target 335902k\n", + "Targeting Medicaid enrollment for KY with target 1244822k\n", + "Targeting Medicaid enrollment for LA with target 1377806k\n", + "Targeting Medicaid enrollment for MA with target 1453344k\n", + "Targeting Medicaid enrollment for MD with target 1280697k\n", + "Targeting Medicaid enrollment for ME with target 322306k\n", + "Targeting Medicaid enrollment for MI with target 2194067k\n", + "Targeting Medicaid enrollment for MN with target 1146667k\n", + "Targeting Medicaid enrollment for MO with target 1118780k\n", + "Targeting Medicaid enrollment for MS with target 514730k\n", + "Targeting Medicaid enrollment for MT with target 193278k\n", + "Targeting Medicaid enrollment for NC with target 2469712k\n", + "Targeting Medicaid enrollment for ND with target 100543k\n", + "Targeting Medicaid enrollment for NE with target 302971k\n", + "Targeting Medicaid enrollment for NH with target 166813k\n", + "Targeting Medicaid enrollment for NJ with target 1506239k\n", + "Targeting Medicaid enrollment for NM with target 686825k\n", + "Targeting Medicaid enrollment for NV with target 713936k\n", + "Targeting Medicaid enrollment for NY with target 5946806k\n", + "Targeting Medicaid enrollment for OH with target 2596879k\n", + "Targeting Medicaid enrollment for OK with target 894911k\n", + "Targeting Medicaid enrollment for OR with target 1123313k\n", + "Targeting Medicaid enrollment for PA with target 2783389k\n", + "Targeting Medicaid enrollment for RI with target 273400k\n", + "Targeting Medicaid enrollment for SC with target 932515k\n", + "Targeting Medicaid enrollment for SD with target 126952k\n", + "Targeting Medicaid enrollment for TN with target 1268904k\n", + "Targeting Medicaid enrollment for TX with target 3821806k\n", + "Targeting Medicaid enrollment for UT with target 300742k\n", + "Targeting Medicaid enrollment for VA with target 1596777k\n", + "Targeting Medicaid enrollment for VT with target 151833k\n", + "Targeting Medicaid enrollment for WA with target 1776116k\n", + "Targeting Medicaid enrollment for WI with target 1108320k\n", + "Targeting Medicaid enrollment for WV with target 467632k\n", + "Targeting Medicaid enrollment for WY with target 57320k\n" + ] + } + ], "source": [ "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", "\n", @@ -28,18 +201,17 @@ " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", " ]\n", "\n", - "approach = \"random_sampling_minimization\" # for which you can specify the fraction\n", - "minimization_function = random_sampling_minimization\n", - "# other minimization function approach is \"candidate_loss_contribution\"\n", + "approach = \"candidate_loss_contribution\" # for which you can specify the fraction\n", + "minimization_function = candidate_loss_contribution\n", + "# other minimization function approach is \"candidate_loss_contribution\", for which you can specify the tolerance for loss relative change.\n", "\n", "for file in files:\n", " output_path = STORAGE_FOLDER / approach / 
\"enhanced_cps_2024_minimised.h5\"\n", " minimise_dataset(\n", " file,\n", " output_path,\n", - " loss_rel_change_max=10,\n", " minimization_function=minimization_function, \n", - " target_fractions=[0.5] # remove if switching approach\n", + " #target_fractions=[0.5] # remove if switching approach\n", " )" ] }, @@ -81,6 +253,14 @@ "\n", "data.save_dataset(output_path)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4cf8e89", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From a71530b7b6f2723cfbf54a64f8f28f9d77e6da1d Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Mon, 14 Jul 2025 19:56:54 +0200 Subject: [PATCH 39/56] fix calibration for each approach --- .../datasets/cps/enhanced_cps.py | 1 + policyengine_us_data/utils/loss.py | 5 -- policyengine_us_data/utils/minimise.py | 89 ++++++++++++++----- test_minimization_approach.ipynb | 86 ++++++++++++------ 4 files changed, 129 insertions(+), 52 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index bf4b5501..33f62929 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -39,6 +39,7 @@ def reweight( loss_matrix, targets_array, dropout_rate=0.05, + epochs=500, log_path="calibration_log.csv", penalty_approach=None, ): diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index 21abce0f..fbdbacef 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -552,11 +552,6 @@ def build_loss_matrix(dataset: type, time_period): # Convert to thousands for the target targets_array.append(row["enrollment"]) - print( - f"Targeting Medicaid enrollment for {row['state']} " - f"with target {row['enrollment']:.0f}k" - ) - # State 10-year age targets age_targets = pd.read_csv(STORAGE_FOLDER / "age_state.csv") diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 9c3d59eb..84c55d31 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -6,6 +6,7 @@ import h5py from policyengine_us_data.storage import STORAGE_FOLDER from typing import Optional, Callable +from policyengine_us_data.datasets.cps.enhanced_cps import reweight bad_targets = [ "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", @@ -20,35 +21,54 @@ def create_calibration_log_file(file_path, epoch=0): + print(f"=== CALIBRATION LOG DEBUG ===") + print(f"File path: {file_path}") + print(f"Epoch: {epoch}") + dataset = Dataset.from_file(file_path) + sim = Microsimulation(dataset=dataset) - loss_matrix, targets = build_loss_matrix(dataset, 2024) + # Debug: Print dataset info + household_weights = sim.calculate("household_weight", 2024) + print(f"Number of households: {len(household_weights)}") + print(f"Total weight: {household_weights.sum():.2f}") + print( + f"Weight range: {household_weights.min():.2f} to {household_weights.max():.2f}" + ) - bad_mask = loss_matrix.columns.isin(bad_targets) - keep_mask_bool = ~bad_mask - keep_idx = np.where(keep_mask_bool)[0] - loss_matrix_clean = loss_matrix.iloc[:, keep_idx] - targets_clean = targets[keep_idx] - assert loss_matrix_clean.shape[1] == targets_clean.size loss_matrix, targets = build_loss_matrix(dataset, 2024) + print(f"Loss matrix shape: {loss_matrix.shape}") + print(f"Number of targets: {len(targets)}") bad_mask = loss_matrix.columns.isin(bad_targets) keep_mask_bool = ~bad_mask keep_idx = 
np.where(keep_mask_bool)[0] loss_matrix_clean = loss_matrix.iloc[:, keep_idx] targets_clean = targets[keep_idx] - assert loss_matrix_clean.shape[1] == targets_clean.size - sim = Microsimulation(dataset=dataset) + print(f"After filtering bad targets:") + print(f"Loss matrix clean shape: {loss_matrix_clean.shape}") + print(f"Number of clean targets: {len(targets_clean)}") + + assert loss_matrix_clean.shape[1] == targets_clean.size estimates = ( sim.calculate("household_weight", 2024).values @ loss_matrix_clean ) target_names = loss_matrix_clean.columns - estimates = ( - sim.calculate("household_weight", 2024).values @ loss_matrix_clean - ) - target_names = loss_matrix_clean.columns + + # Debug: Print estimate statistics + print(f"Estimates shape: {estimates.shape}") + print(f"Estimates sum: {estimates.sum():.2f}") + print(f"First 3 estimates: {estimates[:3]}") + print(f"First 3 targets: {targets_clean[:3]}") + + # Calculate and print some key metrics + errors = estimates - targets_clean + rel_errors = errors / targets_clean + print(f"Mean absolute error: {np.abs(errors).mean():.2f}") + print(f"Mean relative error: {np.abs(rel_errors).mean():.4f}") + print(f"=== END DEBUG ===\n") df = pd.DataFrame( { @@ -158,6 +178,7 @@ def get_loss_from_mask( """ Calculate the loss based on the inclusion mask and the estimate matrix. """ + # Step 1: Apply mask and rescale weights masked_weights = weights.copy() original_weight_total = masked_weights.sum() if (~inclusion_mask).sum() > 0: @@ -166,7 +187,26 @@ def get_loss_from_mask( masked_weights[inclusion_mask] *= ( original_weight_total / masked_weight_total ) - estimates = masked_weights @ estimate_matrix + + # Step 2: Re-calibrate the masked weights to hit targets + # Only calibrate the included households + included_weights = masked_weights[inclusion_mask] + included_estimate_matrix = estimate_matrix[inclusion_mask] + + # Call reweight function to calibrate the selected households + calibrated_weights_included = reweight( + included_weights, + included_estimate_matrix, + targets, + epochs=250, + ) + + # Put calibrated weights back into full array + calibrated_weights = np.zeros_like(masked_weights) + calibrated_weights[inclusion_mask] = calibrated_weights_included + + # Calculate estimates and loss from calibrated weights + estimates = calibrated_weights @ estimate_matrix rel_error = ((estimates - targets) + 1) / (targets + 1) loss = ((rel_error * normalisation_factor) ** 2).mean() @@ -288,8 +328,7 @@ def random_sampling_minimization( targets, normalisation_factor, random=True, - random=True, - target_fractions=[0.1, 0.2, 0.3, 0.4, 0.5], + target_fractions=[0.5, 0.6, 0.7, 0.8, 0.9], ): """A simple random sampling approach""" n = len(weights) @@ -306,7 +345,7 @@ def random_sampling_minimization( best_mask = None best_loss = float("inf") - for _ in range(5): # Try 5 random samples + for _ in range(3): # Try 3 random samples mask = np.zeros(n, dtype=bool) mask[ np.random.choice( @@ -419,12 +458,20 @@ def minimise_dataset( sim = Microsimulation(dataset=smaller_df) # Rescale weights to maintain total - sim.set_input( - "household_weight", - 2024, - sim.calculate("household_weight", 2024).values / weight_rel_change, + initial_weights = ( + sim.calculate("household_weight", 2024).values / weight_rel_change ) + # Re-calibrate the final selected households to hit targets + print("Re-calibrating final selected households...") + calibrated_weights = reweight( + initial_weights, + loss_matrix_clean.values, # Convert to numpy array + targets_clean, + epochs=250, # 
Reduced epochs for faster processing + ) + sim.set_input("household_weight", 2024, calibrated_weights) + print("Final calibration completed successfully") # Prepare data for saving data = {} for variable in sim.input_variables: diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 5a7a9d15..6683da0c 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 6, + "execution_count": 1, "id": "d6dc9cca", "metadata": {}, "outputs": [], @@ -13,12 +13,27 @@ "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", "from policyengine_us_data.utils import build_loss_matrix\n", "import numpy as np\n", - "import os\n" + "import os" ] }, { "cell_type": "code", +<<<<<<< HEAD "execution_count": 7, +======= + "execution_count": null, + "id": "6daabe7c", + "metadata": {}, + "outputs": [], + "source": [ + "# Original ECPS 2024 dataset size (for household entity): 41310\n", + "# After minimization through \"candidate_loss_contribution\" and a 1.0 max error change (for household entity): 20655 \n", + "# After minimization through \"candidate_loss_contribution\" and a 0.001 max error change (for household entity): 24786\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "id": "db975ac1", "metadata": {}, "outputs": [ @@ -128,18 +143,17 @@ "Targeting Medicaid enrollment for WI with target 1108320k\n", "Targeting Medicaid enrollment for WV with target 467632k\n", "Targeting Medicaid enrollment for WY with target 57320k\n", - "Iteration 1: Loss changed from 4.565926440883813e-08 to 5.336074793134824e-08\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 2: Loss changed from 5.336074793134824e-08 to 6.613975524012925e-08\n", + "Iteration 1: Loss changed from 3.2762747622384236e-07 to 3.269240481699973e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Iteration 3: Loss changed from 6.613975524012925e-08 to 8.772356346466888e-08\n", + "Iteration 2: Loss changed from 3.269240481699973e-07 to 3.260236288186747e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Iteration 4: Loss changed from 8.772356346466888e-08 to 1.1406333044735764e-07\n", + "Iteration 3: Loss changed from 3.260236288186747e-07 to 3.258031076364204e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Iteration 5: Loss changed from 1.1406333044735764e-07 to 1.3666972105275468e-07\n", + "Iteration 4: Loss changed from 3.258031076364204e-07 to 3.2572688458943833e-07\n", "Removed 4131 households with worst relative loss changes.\n", - "Weight relative change: 52.19%\n", - "Saved minimised dataset to /Users/elenacura/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", + "Iteration 5: Loss changed from 3.2572688458943833e-07 to 3.2598496400284314e-07, which is too high (0.08%). 
Stopping.\n", + "Weight relative change: 99.10%\n", + "Saved minimised dataset to /Users/movil1/Desktop/PYTHONJOBS/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", "Targeting Medicaid enrollment for AK with target 231577k\n", "Targeting Medicaid enrollment for AL with target 766009k\n", "Targeting Medicaid enrollment for AR with target 733561k\n", @@ -203,32 +217,38 @@ "\n", "approach = \"candidate_loss_contribution\" # for which you can specify the fraction\n", "minimization_function = candidate_loss_contribution\n", - "# other minimization function approach is \"candidate_loss_contribution\", for which you can specify the tolerance for loss relative change.\n", + "# other minimization function approach is \"random_sampling_minimization\", for which you can specify the tolerance for loss relative change.\n", "\n", "for file in files:\n", " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + "\n", " minimise_dataset(\n", " file,\n", " output_path,\n", " minimization_function=minimization_function, \n", - " #target_fractions=[0.5] # remove if switching approach\n", + " # target_fractions=[0.5] # remove if switching approach\n", + " loss_rel_change_max=0.0001, # remove if switching approach\n", " )" ] }, { "cell_type": "code", - "execution_count": null, - "id": "35892c9d", + "execution_count": 4, + "id": "b4cf8e89", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250/250 [01:24<00:00, 2.98it/s, loss=3.37e-5, loss_rel_change=-0.92] \n" + ] + } + ], "source": [ - "## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation\n", - "\n", "input_dataset = ExtendedCPS_2024\n", "\n", - "approach = \"l0_sigmoid\"\n", - "# other options are \"l0_log\", \"l0_exp\", \"l1\"\n", - "\n", "sim = Microsimulation(dataset=input_dataset)\n", "data = sim.dataset.load_dataset()\n", "data[\"household_weight\"] = {}\n", @@ -240,18 +260,32 @@ " loss_matrix, targets_array = build_loss_matrix(\n", " input_dataset, year\n", " )\n", + "\n", + " bad_mask = loss_matrix.columns.isin(bad_targets)\n", + " keep_mask_bool = ~bad_mask\n", + " keep_idx = np.where(keep_mask_bool)[0]\n", + " loss_matrix_clean = loss_matrix.iloc[:, keep_idx]\n", + " targets_array_clean = targets_array[keep_idx]\n", + " assert loss_matrix_clean.shape[1] == targets_array_clean.size\n", + " assert loss_matrix_clean.shape[1] != loss_matrix.shape[1]\n", + "\n", " optimised_weights = reweight(\n", " original_weights,\n", - " loss_matrix,\n", - " targets_array,\n", - " log_path= STORAGE_FOLDER / approach / \"calibration_log.csv\",\n", - " penalty_approach=approach,\n", + " loss_matrix_clean,\n", + " targets_array_clean,\n", + " log_path=\"baseline_calibration_log.csv\",\n", + " epochs=250, # Reduced epochs for faster processing\n", " )\n", " data[\"household_weight\"][year] = optimised_weights\n", "\n", - "output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + "output_path = STORAGE_FOLDER / \"baseline\" / \"enhanced_cps_2024_baseline.h5\"\n", + "output_path.parent.mkdir(parents=True, exist_ok=True)\n", "\n", - "data.save_dataset(output_path)" + "# Save to HDF5 file\n", + "with h5py.File(output_path, \"w\") as f:\n", + " for variable, values in data.items():\n", + " for year, value in values.items():\n", + " 
f.create_dataset(f\"{variable}/{year}\", data=value)" ] }, { From f146620a9c71761336d7b1c49ae5e54b09f100e4 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Mon, 14 Jul 2025 20:19:38 +0200 Subject: [PATCH 40/56] fixed testing framework --- policyengine_us_data/utils/minimise.py | 39 +-- test_minimization_approach.ipynb | 330 ++++++++++--------------- 2 files changed, 134 insertions(+), 235 deletions(-) diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 84c55d31..b3e0ed1a 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -75,11 +75,9 @@ def create_calibration_log_file(file_path, epoch=0): "target_name": target_names, "estimate": estimates, "target": targets_clean, - "target": targets_clean, } ) df["epoch"] = epoch - df["epoch"] = epoch df["error"] = df["estimate"] - df["target"] df["rel_error"] = df["error"] / df["target"] df["abs_error"] = df["error"].abs() @@ -88,11 +86,6 @@ def create_calibration_log_file(file_path, epoch=0): if df["target"].abs().sum() > 0 else np.nan ) - df["rel_abs_error"] = ( - df["abs_error"] / df["target"].abs() - if df["target"].abs().sum() > 0 - else np.nan - ) df["loss"] = (df["rel_error"] ** 2).mean() df.to_csv( @@ -172,6 +165,7 @@ def minimise_dataset( ) weights @ estimate_matrix + def get_loss_from_mask( weights, inclusion_mask, estimate_matrix, targets, normalisation_factor ): @@ -264,15 +258,6 @@ def candidate_loss_contribution( size=int(full_mask.sum() * view_fraction_per_iteration), replace=False, ) - - # more efficient approach to compute losses for candidate households to be removed - - # 1. sample only households that are currently *included* - indices = np.random.choice( - np.where(full_mask)[0], - size=int(full_mask.sum() * VIEW_FRACTION_PER_ITERATION), - replace=False, - ) # 2. 
compute losses for the batch in one shot candidate_losses = losses_for_candidates( weights, indices, estimate_matrix, targets, normalisation_factor @@ -335,8 +320,6 @@ def random_sampling_minimization( household_weights_normalized = weights / weights.sum() - household_weights_normalized = weights / weights.sum() - final_mask = None lowest_loss = float("inf") for fraction in target_fractions: @@ -355,14 +338,6 @@ def random_sampling_minimization( replace=False, ) ] = True - mask[ - np.random.choice( - n, - target_size, - p=household_weights_normalized if random else None, - replace=False, - ) - ] = True loss = get_loss_from_mask( weights, mask, estimate_matrix, targets, normalisation_factor @@ -402,14 +377,6 @@ def minimise_dataset( dataset = Dataset.from_file(dataset) loss_matrix, targets = build_loss_matrix(dataset, 2024) - bad_mask = loss_matrix.columns.isin(bad_targets) - keep_mask_bool = ~bad_mask - keep_idx = np.where(keep_mask_bool)[0] - loss_matrix_clean = loss_matrix.iloc[:, keep_idx] - targets_clean = targets[keep_idx] - assert loss_matrix_clean.shape[1] == targets_clean.size - loss_matrix, targets = build_loss_matrix(dataset, 2024) - bad_mask = loss_matrix.columns.isin(bad_targets) keep_mask_bool = ~bad_mask keep_idx = np.where(keep_mask_bool)[0] @@ -421,7 +388,6 @@ def minimise_dataset( weights = sim.calculate("household_weight", 2024).values is_national = loss_matrix_clean.columns.str.startswith("nation/") - is_national = loss_matrix_clean.columns.str.startswith("nation/") nation_normalisation_factor = is_national * (1 / is_national.sum()) state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) normalisation_factor = np.where( @@ -433,11 +399,8 @@ def minimise_dataset( weights=weights, estimate_matrix=loss_matrix_clean, targets=targets_clean, - estimate_matrix=loss_matrix_clean, - targets=targets_clean, normalisation_factor=normalisation_factor, **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. - **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. 
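        # Which keyword ends up in **kwargs depends on the chosen minimization_function:
        # candidate_loss_contribution is tuned with loss_rel_change_max, while
        # random_sampling_minimization is tuned with target_fractions (see the calls
        # in test_minimization_approach.ipynb).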
) # Extract household IDs for remaining households diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb index 6683da0c..7c416e2a 100644 --- a/test_minimization_approach.ipynb +++ b/test_minimization_approach.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "d6dc9cca", "metadata": {}, "outputs": [], @@ -13,228 +13,172 @@ "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", "from policyengine_us_data.utils import build_loss_matrix\n", "import numpy as np\n", - "import os" + "import os\n", + "import h5py\n", + "\n", + "bad_targets = [\n", + " \"nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household\",\n", + " \"nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household\",\n", + " \"nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse\",\n", + " \"nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse\",\n", + " \"nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household\",\n", + " \"nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household\",\n", + " \"nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse\",\n", + " \"nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse\",\n", + "]" ] }, { "cell_type": "code", -<<<<<<< HEAD - "execution_count": 7, -======= "execution_count": null, - "id": "6daabe7c", + "id": "683fd57e", "metadata": {}, "outputs": [], "source": [ - "# Original ECPS 2024 dataset size (for household entity): 41310\n", - "# After minimization through \"candidate_loss_contribution\" and a 1.0 max error change (for household entity): 20655 \n", - "# After minimization through \"candidate_loss_contribution\" and a 0.001 max error change (for household entity): 24786\n" + "# Length of household entity in the dataset measured through household_weight:\n", + "\n", + "# Original ECPS 2024 dataset size: 41310\n", + "# Through \"random_sampling_minimization\" with 0.5 of the dataset being pruned: 20655\n", + "# Through \"random_sampling_minimization\" with 0.2 of the dataset being pruned: 33408\n", + "# After minimization through \"candidate_loss_contribution\" and a 1.0 max error change: 20655 \n", + "# After minimization through \"candidate_loss_contribution\" and a 0.001 max error change: 24786" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "db975ac1", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Targeting Medicaid enrollment for AK with target 231577k\n", - "Targeting Medicaid enrollment for AL with target 766009k\n", - "Targeting Medicaid enrollment for AR with target 733561k\n", - "Targeting Medicaid enrollment for AZ with target 1778734k\n", - "Targeting Medicaid enrollment for CA with target 12172695k\n", - "Targeting Medicaid enrollment for CO with target 1058326k\n", - "Targeting Medicaid enrollment for CT with target 904321k\n", - "Targeting Medicaid enrollment for DC with target 240020k\n", - "Targeting Medicaid enrollment for DE with target 236840k\n", - "Targeting Medicaid enrollment for FL with target 3568648k\n", - "Targeting Medicaid enrollment for GA with target 1699279k\n", - "Targeting Medicaid enrollment for HI with target 376318k\n", - "Targeting Medicaid enrollment for IA with target 586748k\n", - "Targeting Medicaid enrollment for ID 
with target 296968k\n",
    [... further "Targeting Medicaid enrollment for <state> with target ...k" output lines, a verbatim repeat of the state-by-state list shown earlier in this output ...]
    -    "Targeting Medicaid enrollment for KY with 
target 1244822k\n", - "Targeting Medicaid enrollment for LA with target 1377806k\n", - "Targeting Medicaid enrollment for MA with target 1453344k\n", - "Targeting Medicaid enrollment for MD with target 1280697k\n", - "Targeting Medicaid enrollment for ME with target 322306k\n", - "Targeting Medicaid enrollment for MI with target 2194067k\n", - "Targeting Medicaid enrollment for MN with target 1146667k\n", - "Targeting Medicaid enrollment for MO with target 1118780k\n", - "Targeting Medicaid enrollment for MS with target 514730k\n", - "Targeting Medicaid enrollment for MT with target 193278k\n", - "Targeting Medicaid enrollment for NC with target 2469712k\n", - "Targeting Medicaid enrollment for ND with target 100543k\n", - "Targeting Medicaid enrollment for NE with target 302971k\n", - "Targeting Medicaid enrollment for NH with target 166813k\n", - "Targeting Medicaid enrollment for NJ with target 1506239k\n", - "Targeting Medicaid enrollment for NM with target 686825k\n", - "Targeting Medicaid enrollment for NV with target 713936k\n", - "Targeting Medicaid enrollment for NY with target 5946806k\n", - "Targeting Medicaid enrollment for OH with target 2596879k\n", - "Targeting Medicaid enrollment for OK with target 894911k\n", - "Targeting Medicaid enrollment for OR with target 1123313k\n", - "Targeting Medicaid enrollment for PA with target 2783389k\n", - "Targeting Medicaid enrollment for RI with target 273400k\n", - "Targeting Medicaid enrollment for SC with target 932515k\n", - "Targeting Medicaid enrollment for SD with target 126952k\n", - "Targeting Medicaid enrollment for TN with target 1268904k\n", - "Targeting Medicaid enrollment for TX with target 3821806k\n", - "Targeting Medicaid enrollment for UT with target 300742k\n", - "Targeting Medicaid enrollment for VA with target 1596777k\n", - "Targeting Medicaid enrollment for VT with target 151833k\n", - "Targeting Medicaid enrollment for WA with target 1776116k\n", - "Targeting Medicaid enrollment for WI with target 1108320k\n", - "Targeting Medicaid enrollment for WV with target 467632k\n", - "Targeting Medicaid enrollment for WY with target 57320k\n", - "Iteration 1: Loss changed from 3.2762747622384236e-07 to 3.269240481699973e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 2: Loss changed from 3.269240481699973e-07 to 3.260236288186747e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 3: Loss changed from 3.260236288186747e-07 to 3.258031076364204e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 4: Loss changed from 3.258031076364204e-07 to 3.2572688458943833e-07\n", - "Removed 4131 households with worst relative loss changes.\n", - "Iteration 5: Loss changed from 3.2572688458943833e-07 to 3.2598496400284314e-07, which is too high (0.08%). 
Stopping.\n", - "Weight relative change: 99.10%\n", - "Saved minimised dataset to /Users/movil1/Desktop/PYTHONJOBS/PolicyEngine/policyengine-us-data/policyengine_us_data/storage/candidate_loss_contribution/enhanced_cps_2024_minimised.h5\n", - "Targeting Medicaid enrollment for AK with target 231577k\n", - "Targeting Medicaid enrollment for AL with target 766009k\n", - "Targeting Medicaid enrollment for AR with target 733561k\n", - "Targeting Medicaid enrollment for AZ with target 1778734k\n", - "Targeting Medicaid enrollment for CA with target 12172695k\n", - "Targeting Medicaid enrollment for CO with target 1058326k\n", - "Targeting Medicaid enrollment for CT with target 904321k\n", - "Targeting Medicaid enrollment for DC with target 240020k\n", - "Targeting Medicaid enrollment for DE with target 236840k\n", - "Targeting Medicaid enrollment for FL with target 3568648k\n", - "Targeting Medicaid enrollment for GA with target 1699279k\n", - "Targeting Medicaid enrollment for HI with target 376318k\n", - "Targeting Medicaid enrollment for IA with target 586748k\n", - "Targeting Medicaid enrollment for ID with target 296968k\n", - "Targeting Medicaid enrollment for IL with target 2918179k\n", - "Targeting Medicaid enrollment for IN with target 1623361k\n", - "Targeting Medicaid enrollment for KS with target 335902k\n", - "Targeting Medicaid enrollment for KY with target 1244822k\n", - "Targeting Medicaid enrollment for LA with target 1377806k\n", - "Targeting Medicaid enrollment for MA with target 1453344k\n", - "Targeting Medicaid enrollment for MD with target 1280697k\n", - "Targeting Medicaid enrollment for ME with target 322306k\n", - "Targeting Medicaid enrollment for MI with target 2194067k\n", - "Targeting Medicaid enrollment for MN with target 1146667k\n", - "Targeting Medicaid enrollment for MO with target 1118780k\n", - "Targeting Medicaid enrollment for MS with target 514730k\n", - "Targeting Medicaid enrollment for MT with target 193278k\n", - "Targeting Medicaid enrollment for NC with target 2469712k\n", - "Targeting Medicaid enrollment for ND with target 100543k\n", - "Targeting Medicaid enrollment for NE with target 302971k\n", - "Targeting Medicaid enrollment for NH with target 166813k\n", - "Targeting Medicaid enrollment for NJ with target 1506239k\n", - "Targeting Medicaid enrollment for NM with target 686825k\n", - "Targeting Medicaid enrollment for NV with target 713936k\n", - "Targeting Medicaid enrollment for NY with target 5946806k\n", - "Targeting Medicaid enrollment for OH with target 2596879k\n", - "Targeting Medicaid enrollment for OK with target 894911k\n", - "Targeting Medicaid enrollment for OR with target 1123313k\n", - "Targeting Medicaid enrollment for PA with target 2783389k\n", - "Targeting Medicaid enrollment for RI with target 273400k\n", - "Targeting Medicaid enrollment for SC with target 932515k\n", - "Targeting Medicaid enrollment for SD with target 126952k\n", - "Targeting Medicaid enrollment for TN with target 1268904k\n", - "Targeting Medicaid enrollment for TX with target 3821806k\n", - "Targeting Medicaid enrollment for UT with target 300742k\n", - "Targeting Medicaid enrollment for VA with target 1596777k\n", - "Targeting Medicaid enrollment for VT with target 151833k\n", - "Targeting Medicaid enrollment for WA with target 1776116k\n", - "Targeting Medicaid enrollment for WI with target 1108320k\n", - "Targeting Medicaid enrollment for WV with target 467632k\n", - "Targeting Medicaid enrollment for WY with target 57320k\n" - ] - } - ], + "outputs": 
[], "source": [ + "## ALL TESTS\n", + "\n", + "## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation\n", + "\n", + "input_dataset = ExtendedCPS_2024\n", + "\n", + "approaches = [\"l0_sigmoid\", \"l0_log\", \"l0_exp\", \"l1\"]\n", + "\n", + "for approach in approaches:\n", + " sim = Microsimulation(dataset=input_dataset)\n", + " data = sim.dataset.load_dataset()\n", + " data[\"household_weight\"] = {}\n", + " original_weights = sim.calculate(\"household_weight\")\n", + " original_weights = original_weights.values + np.random.normal(\n", + " 1, 0.1, len(original_weights)\n", + " )\n", + " for year in range(2024, 2025):\n", + " loss_matrix, targets_array = build_loss_matrix(\n", + " input_dataset, year\n", + " )\n", + "\n", + " bad_mask = loss_matrix.columns.isin(bad_targets)\n", + " keep_mask_bool = ~bad_mask\n", + " keep_idx = np.where(keep_mask_bool)[0]\n", + " loss_matrix_clean = loss_matrix.iloc[:, keep_idx]\n", + " targets_array_clean = targets_array[keep_idx]\n", + " assert loss_matrix_clean.shape[1] == targets_array_clean.size\n", + "\n", + " optimised_weights = reweight(\n", + " original_weights,\n", + " loss_matrix_clean,\n", + " targets_array_clean,\n", + " log_path=\"calibration_log.csv\",\n", + " penalty_approach=approach,\n", + " epochs=250, # Reduced epochs for faster processing\n", + " )\n", + " data[\"household_weight\"][year] = optimised_weights\n", + "\n", + " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + "\n", + " # Save to HDF5 file\n", + " with h5py.File(output_path, \"w\") as f:\n", + " for variable, values in data.items():\n", + " for year, value in values.items():\n", + " f.create_dataset(f\"{variable}/{year}\", data=value)\n", + "\n", + "\n", "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", "\n", "files = [\n", " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", " ]\n", "\n", - "approach = \"candidate_loss_contribution\" # for which you can specify the fraction\n", - "minimization_function = candidate_loss_contribution\n", + "approaches = {\n", + " \"random_sampling_minimization\": random_sampling_minimization,\n", + " \"candidate_loss_contribution\": candidate_loss_contribution,\n", + "}\n", + "\n", + "optional_params = {\n", + " \"random_sampling_minimization\": {\n", + " \"target_fractions\": [0.5, 0.6, 0.7, 0.8, 0.9], # fractions of the dataset to keep\n", + " },\n", + " \"candidate_loss_contribution\": {\n", + " \"loss_rel_change_max\": [0.00001, 0.000001, 0.0000001] # maximum relative change in\n", + " }\n", + "}\n", + "\n", + "for approach, function in approaches.items():\n", + " minimization_function = function\n", + " # other minimization function approach is \"random_sampling_minimization\", for which you can specify the tolerance for loss relative change.\n", + "\n", + " for params, values in optional_params[approach].items():\n", + " for value in values:\n", + " if params == \"target_fractions\":\n", + " for file in files:\n", + " output_path = STORAGE_FOLDER / approach / f\"{value}_enhanced_cps_2024_minimised.h5\"\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + " minimise_dataset(\n", + " file,\n", + " output_path,\n", + " minimization_function=minimization_function, \n", + " target_fractions=[value]\n", + " )\n", + " elif params == \"loss_rel_change_max\":\n", + " for file in files:\n", + " output_path = STORAGE_FOLDER / approach / 
f\"{value}_enhanced_cps_2024_minimised.h5\"\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + " minimise_dataset(\n", + " file,\n", + " output_path,\n", + " minimization_function=minimization_function, \n", + " loss_rel_change_max=value\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35892c9d", + "metadata": {}, + "outputs": [], + "source": [ + "## SMALL CHECKS BELOW -- IGNORE ---\n", + "\n", + "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", + "\n", + "files = [\n", + " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", + " ]\n", + "\n", + "minimization_function = random_sampling_minimization\n", "# other minimization function approach is \"random_sampling_minimization\", for which you can specify the tolerance for loss relative change.\n", "\n", "for file in files:\n", - " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", + " output_path = STORAGE_FOLDER / \"random_sampling_minimization\" / f\"{1.0}_enhanced_cps_2024_minimised.h5\"\n", " output_path.parent.mkdir(parents=True, exist_ok=True)\n", - "\n", " minimise_dataset(\n", " file,\n", " output_path,\n", " minimization_function=minimization_function, \n", - " # target_fractions=[0.5] # remove if switching approach\n", - " loss_rel_change_max=0.0001, # remove if switching approach\n", + " target_fractions=[1.0]\n", " )" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "b4cf8e89", "metadata": {}, "outputs": [ @@ -287,14 +231,6 @@ " for year, value in values.items():\n", " f.create_dataset(f\"{variable}/{year}\", data=value)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b4cf8e89", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From 51d9c9c90c632db53fb7b9c0e5fe24f319859b17 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 11:09:12 +0200 Subject: [PATCH 41/56] attempt generating Minimized ECPS --- changelog_entry.yaml | 2 +- .../datasets/cps/enhanced_cps.py | 74 +++-- .../tests/test_datasets/test_enhanced_cps.py | 59 ++++ policyengine_us_data/utils/minimise.py | 89 ++---- test_minimization_approach.ipynb | 257 ------------------ 5 files changed, 140 insertions(+), 341 deletions(-) delete mode 100644 test_minimization_approach.ipynb diff --git a/changelog_entry.yaml b/changelog_entry.yaml index ac664753..725035b9 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,4 +1,4 @@ - bump: minor changes: added: - - Enhanced CPS minimizing tests. \ No newline at end of file + - Minimized Enhanced CPS. 
\ No newline at end of file diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 83fe6b99..82aa9f27 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -9,6 +9,10 @@ import numpy as np from typing import Type from policyengine_us_data.storage import STORAGE_FOLDER +from policyengine_us_data.utils.minimise import ( + candidate_loss_contribution, + minimize_dataset, +) from policyengine_us_data.datasets.cps.extended_cps import ( ExtendedCPS_2024, CPS_2019, @@ -231,28 +235,6 @@ def generate(self): 1, 0.1, len(original_weights) ) - bad_targets = [ - "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", - "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", - "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", - "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", - "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", - "state/RI/adjusted_gross_income/amount/-inf_1", - "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", - "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", - "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", - "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", - "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", - "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", - "state/RI/adjusted_gross_income/amount/-inf_1", - "nation/irs/exempt interest/count/AGI in -inf-inf/taxable/All", - ] - # Run the optimization procedure to get (close to) minimum loss weights for year in range(self.start_year, self.end_year + 1): loss_matrix, targets_array = build_loss_matrix( @@ -327,6 +309,53 @@ def generate(self): self.save_dataset(data) +class MinimizedEnhancedCPS_2024(Dataset): + input_dataset = ExtendedCPS_2024 + start_year = 2024 + name = "minimized_enhanced_cps_2024" + label = "Minimized Enhanced CPS 2024" + file_path = STORAGE_FOLDER / "minimized_enhanced_cps_2024.h5" + url = ( + "hf://policyengine/policyengine-us-data/minimized_enhanced_cps_2024.h5" + ) + + def generate(self): + from policyengine_us import Microsimulation + + sim = Microsimulation(dataset=self.input_dataset) + data = sim.dataset.load_dataset() + data["household_weight"] = {} + original_weights = sim.calculate("household_weight") + original_weights = original_weights.values + np.random.normal( + 1, 0.1, len(original_weights) + ) + + # Run the optimization procedure to get (close to) minimum loss weights + for year in range(self.start_year, self.end_year + 1): + loss_matrix, targets_array = build_loss_matrix( + self.input_dataset, year + ) + + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = 
loss_matrix.iloc[:, keep_idx] + targets_array_clean = targets_array[keep_idx] + assert loss_matrix_clean.shape[1] == targets_array_clean.size + + minimize_dataset( + self.input_dataset, + self.file_path, + minimization_function=candidate_loss_contribution, + loss_matrix=loss_matrix_clean, + targets=targets_array_clean, + target_fractions=[0.1], # maximum relative change in loss + count_iterations=5, + view_fraction_per_iteration=0.5, + fraction_remove_per_iteration=0.1, + ) + + class EnhancedCPS_2024(EnhancedCPS): input_dataset = ExtendedCPS_2024 start_year = 2024 @@ -339,3 +368,4 @@ class EnhancedCPS_2024(EnhancedCPS): if __name__ == "__main__": EnhancedCPS_2024().generate() + MinimizedEnhancedCPS_2024().generate() diff --git a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py index abf67301..c6660f66 100644 --- a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py +++ b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py @@ -1,4 +1,5 @@ import pytest +import pandas as pd def test_ecps_has_mortgage_interest(): @@ -254,3 +255,61 @@ def test_medicaid_calibration(): assert ( not failed ), f"One or more states exceeded tolerance of {TOLERANCE:.0%}." + + +def test_minimized_enhanced_cps_calibration_quality(): + """ + Test that minimized Enhanced CPS datasets maintain calibration quality above 75%. + Quality score formula: ((excellentCount * 100 + goodCount * 75) / totalTargets) + + Quality Categories: + - Excellent (< 5% error): 100 points each + - Good (5-20% error): 75 points each + - Poor (≥ 20% error): 0 points each + """ + from policyengine_us_data.datasets.cps import MinimizedEnhancedCPS_2024 + from policyengine_us_data.utils.minimise import create_calibration_log_file + from policyengine_us import Microsimulation + + sim = Microsimulation(dataset=MinimizedEnhancedCPS_2024) + assert ( + len(sim.calculate("household_weight")) < 30_000 + ), "Minimized Enhanced CPS should have fewer than 30,000 households." + + create_calibration_log_file(MinimizedEnhancedCPS_2024) + + calibration_log = pd.read_csv( + str(MinimizedEnhancedCPS_2024.file_path).replace( + ".h5", "_calibration_log.csv" + ) + ) + + # Calculate quality categories + excellent_count = ( + calibration_log["rel_abs_error"] < 0.05 + ).sum() # < 5% error + good_count = ( + (calibration_log["rel_abs_error"] >= 0.05) + & (calibration_log["rel_abs_error"] < 0.20) + ).sum() # 5-20% error + poor_count = ( + calibration_log["rel_abs_error"] >= 0.20 + ).sum() # ≥ 20% error + total_targets = len(calibration_log) + + # Calculate quality score + quality_score = (excellent_count * 100 + good_count * 75) / total_targets + + print(f" Total targets: {total_targets}") + print(f" Excellent (< 5% error): {excellent_count}") + print(f" Good (5-20% error): {good_count}") + print(f" Poor (≥ 20% error): {poor_count}") + print(f" Quality score: {quality_score:.1f}%") + + # Assert quality score is above 75% + assert quality_score >= 75.0, ( + f"Calibration quality score {quality_score:.1f}% is below 75% threshold " + f"for {MinimizedEnhancedCPS_2024.label}. " + f"Breakdown: {excellent_count} excellent, {good_count} good, {poor_count} poor " + f"out of {total_targets} total targets." 
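        # Illustrative arithmetic for the score above: with 80 excellent, 15 good and
        # 5 poor targets out of 100, quality_score = (80 * 100 + 15 * 75) / 100 = 91.25,
        # which clears the 75.0 threshold.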
+ ) diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index b3e0ed1a..2048ce61 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimise.py @@ -21,24 +21,10 @@ def create_calibration_log_file(file_path, epoch=0): - print(f"=== CALIBRATION LOG DEBUG ===") - print(f"File path: {file_path}") - print(f"Epoch: {epoch}") - dataset = Dataset.from_file(file_path) sim = Microsimulation(dataset=dataset) - # Debug: Print dataset info - household_weights = sim.calculate("household_weight", 2024) - print(f"Number of households: {len(household_weights)}") - print(f"Total weight: {household_weights.sum():.2f}") - print( - f"Weight range: {household_weights.min():.2f} to {household_weights.max():.2f}" - ) - loss_matrix, targets = build_loss_matrix(dataset, 2024) - print(f"Loss matrix shape: {loss_matrix.shape}") - print(f"Number of targets: {len(targets)}") bad_mask = loss_matrix.columns.isin(bad_targets) keep_mask_bool = ~bad_mask @@ -46,10 +32,6 @@ def create_calibration_log_file(file_path, epoch=0): loss_matrix_clean = loss_matrix.iloc[:, keep_idx] targets_clean = targets[keep_idx] - print(f"After filtering bad targets:") - print(f"Loss matrix clean shape: {loss_matrix_clean.shape}") - print(f"Number of clean targets: {len(targets_clean)}") - assert loss_matrix_clean.shape[1] == targets_clean.size estimates = ( @@ -57,18 +39,9 @@ def create_calibration_log_file(file_path, epoch=0): ) target_names = loss_matrix_clean.columns - # Debug: Print estimate statistics - print(f"Estimates shape: {estimates.shape}") - print(f"Estimates sum: {estimates.sum():.2f}") - print(f"First 3 estimates: {estimates[:3]}") - print(f"First 3 targets: {targets_clean[:3]}") - # Calculate and print some key metrics errors = estimates - targets_clean rel_errors = errors / targets_clean - print(f"Mean absolute error: {np.abs(errors).mean():.2f}") - print(f"Mean relative error: {np.abs(rel_errors).mean():.4f}") - print(f"=== END DEBUG ===\n") df = pd.DataFrame( { @@ -144,28 +117,6 @@ def losses_for_candidates( return losses -def minimise_dataset( - dataset, output_path: str, loss_rel_change_max: float -) -> None: - dataset = str(dataset) - create_calibration_log_file(dataset) - - dataset = Dataset.from_file(dataset) - loss_matrix = build_loss_matrix(dataset, 2024) - - sim = Microsimulation(dataset=dataset) - - weights = sim.calculate("household_weight", 2024).values - estimate_matrix, targets = loss_matrix - is_national = estimate_matrix.columns.str.startswith("nation/") - nation_normalisation_factor = is_national * (1 / is_national.sum()) - state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) - normalisation_factor = np.where( - is_national, nation_normalisation_factor, state_normalisation_factor - ) - weights @ estimate_matrix - - def get_loss_from_mask( weights, inclusion_mask, estimate_matrix, targets, normalisation_factor ): @@ -185,7 +136,9 @@ def get_loss_from_mask( # Step 2: Re-calibrate the masked weights to hit targets # Only calibrate the included households included_weights = masked_weights[inclusion_mask] - included_estimate_matrix = estimate_matrix[inclusion_mask] + included_estimate_matrix = estimate_matrix.iloc[ + inclusion_mask + ] # Keep as DataFrame # Call reweight function to calibrate the selected households calibrated_weights_included = reweight( @@ -354,10 +307,12 @@ def random_sampling_minimization( return final_mask -def minimise_dataset( +def minimize_dataset( dataset, output_path: str, minimization_function: 
Callable = candidate_loss_contribution, + loss_matrix: Optional[pd.DataFrame] = None, + targets: Optional[np.ndarray] = None, **kwargs, ) -> None: """ @@ -375,14 +330,15 @@ def minimise_dataset( create_calibration_log_file(dataset) dataset = Dataset.from_file(dataset) - loss_matrix, targets = build_loss_matrix(dataset, 2024) + if loss_matrix is None or targets is None: + loss_matrix, targets = build_loss_matrix(dataset, 2024) - bad_mask = loss_matrix.columns.isin(bad_targets) - keep_mask_bool = ~bad_mask - keep_idx = np.where(keep_mask_bool)[0] - loss_matrix_clean = loss_matrix.iloc[:, keep_idx] - targets_clean = targets[keep_idx] - assert loss_matrix_clean.shape[1] == targets_clean.size + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_clean = targets[keep_idx] + assert loss_matrix_clean.shape[1] == targets_clean.size sim = Microsimulation(dataset=dataset) @@ -427,10 +383,21 @@ def minimise_dataset( # Re-calibrate the final selected households to hit targets print("Re-calibrating final selected households...") + + # Build loss matrix for the smaller dataset + smaller_loss_matrix, smaller_targets = build_loss_matrix(sim.dataset, 2024) + + # Apply same filtering as before + bad_mask = smaller_loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + smaller_loss_matrix_clean = smaller_loss_matrix.iloc[:, keep_idx] + smaller_targets_clean = smaller_targets[keep_idx] + calibrated_weights = reweight( initial_weights, - loss_matrix_clean.values, # Convert to numpy array - targets_clean, + smaller_loss_matrix_clean, # Now matches the smaller dataset size + smaller_targets_clean, epochs=250, # Reduced epochs for faster processing ) sim.set_input("household_weight", 2024, calibrated_weights) @@ -460,7 +427,7 @@ def minimise_dataset( for file in files: output_path = file.with_name(file.stem + "_minimised.h5") - minimise_dataset( + minimize_dataset( file, output_path, ) diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb deleted file mode 100644 index 7c416e2a..00000000 --- a/test_minimization_approach.ipynb +++ /dev/null @@ -1,257 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "id": "d6dc9cca", - "metadata": {}, - "outputs": [], - "source": [ - "from policyengine_us_data.utils.minimise import minimise_dataset, random_sampling_minimization, candidate_loss_contribution\n", - "from policyengine_us_data.storage import STORAGE_FOLDER\n", - "from policyengine_us import Microsimulation\n", - "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", - "from policyengine_us_data.utils import build_loss_matrix\n", - "import numpy as np\n", - "import os\n", - "import h5py\n", - "\n", - "bad_targets = [\n", - " \"nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household\",\n", - " \"nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household\",\n", - " \"nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse\",\n", - " \"nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse\",\n", - " \"nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household\",\n", - " \"nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household\",\n", - " \"nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing 
Jointly/Surviving Spouse\",\n", - " \"nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse\",\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "683fd57e", - "metadata": {}, - "outputs": [], - "source": [ - "# Length of household entity in the dataset measured through household_weight:\n", - "\n", - "# Original ECPS 2024 dataset size: 41310\n", - "# Through \"random_sampling_minimization\" with 0.5 of the dataset being pruned: 20655\n", - "# Through \"random_sampling_minimization\" with 0.2 of the dataset being pruned: 33408\n", - "# After minimization through \"candidate_loss_contribution\" and a 1.0 max error change: 20655 \n", - "# After minimization through \"candidate_loss_contribution\" and a 0.001 max error change: 24786" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "db975ac1", - "metadata": {}, - "outputs": [], - "source": [ - "## ALL TESTS\n", - "\n", - "## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation\n", - "\n", - "input_dataset = ExtendedCPS_2024\n", - "\n", - "approaches = [\"l0_sigmoid\", \"l0_log\", \"l0_exp\", \"l1\"]\n", - "\n", - "for approach in approaches:\n", - " sim = Microsimulation(dataset=input_dataset)\n", - " data = sim.dataset.load_dataset()\n", - " data[\"household_weight\"] = {}\n", - " original_weights = sim.calculate(\"household_weight\")\n", - " original_weights = original_weights.values + np.random.normal(\n", - " 1, 0.1, len(original_weights)\n", - " )\n", - " for year in range(2024, 2025):\n", - " loss_matrix, targets_array = build_loss_matrix(\n", - " input_dataset, year\n", - " )\n", - "\n", - " bad_mask = loss_matrix.columns.isin(bad_targets)\n", - " keep_mask_bool = ~bad_mask\n", - " keep_idx = np.where(keep_mask_bool)[0]\n", - " loss_matrix_clean = loss_matrix.iloc[:, keep_idx]\n", - " targets_array_clean = targets_array[keep_idx]\n", - " assert loss_matrix_clean.shape[1] == targets_array_clean.size\n", - "\n", - " optimised_weights = reweight(\n", - " original_weights,\n", - " loss_matrix_clean,\n", - " targets_array_clean,\n", - " log_path=\"calibration_log.csv\",\n", - " penalty_approach=approach,\n", - " epochs=250, # Reduced epochs for faster processing\n", - " )\n", - " data[\"household_weight\"][year] = optimised_weights\n", - "\n", - " output_path = STORAGE_FOLDER / approach / \"enhanced_cps_2024_minimised.h5\"\n", - " output_path.parent.mkdir(parents=True, exist_ok=True)\n", - "\n", - " # Save to HDF5 file\n", - " with h5py.File(output_path, \"w\") as f:\n", - " for variable, values in data.items():\n", - " for year, value in values.items():\n", - " f.create_dataset(f\"{variable}/{year}\", data=value)\n", - "\n", - "\n", - "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", - "\n", - "files = [\n", - " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", - " ]\n", - "\n", - "approaches = {\n", - " \"random_sampling_minimization\": random_sampling_minimization,\n", - " \"candidate_loss_contribution\": candidate_loss_contribution,\n", - "}\n", - "\n", - "optional_params = {\n", - " \"random_sampling_minimization\": {\n", - " \"target_fractions\": [0.5, 0.6, 0.7, 0.8, 0.9], # fractions of the dataset to keep\n", - " },\n", - " \"candidate_loss_contribution\": {\n", - " \"loss_rel_change_max\": [0.00001, 0.000001, 0.0000001] # maximum relative change in\n", - " }\n", - "}\n", - "\n", - "for approach, function in approaches.items():\n", - " minimization_function = 
function\n", - " # other minimization function approach is \"random_sampling_minimization\", for which you can specify the tolerance for loss relative change.\n", - "\n", - " for params, values in optional_params[approach].items():\n", - " for value in values:\n", - " if params == \"target_fractions\":\n", - " for file in files:\n", - " output_path = STORAGE_FOLDER / approach / f\"{value}_enhanced_cps_2024_minimised.h5\"\n", - " output_path.parent.mkdir(parents=True, exist_ok=True)\n", - " minimise_dataset(\n", - " file,\n", - " output_path,\n", - " minimization_function=minimization_function, \n", - " target_fractions=[value]\n", - " )\n", - " elif params == \"loss_rel_change_max\":\n", - " for file in files:\n", - " output_path = STORAGE_FOLDER / approach / f\"{value}_enhanced_cps_2024_minimised.h5\"\n", - " output_path.parent.mkdir(parents=True, exist_ok=True)\n", - " minimise_dataset(\n", - " file,\n", - " output_path,\n", - " minimization_function=minimization_function, \n", - " loss_rel_change_max=value\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "35892c9d", - "metadata": {}, - "outputs": [], - "source": [ - "## SMALL CHECKS BELOW -- IGNORE ---\n", - "\n", - "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", - "\n", - "files = [\n", - " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", - " ]\n", - "\n", - "minimization_function = random_sampling_minimization\n", - "# other minimization function approach is \"random_sampling_minimization\", for which you can specify the tolerance for loss relative change.\n", - "\n", - "for file in files:\n", - " output_path = STORAGE_FOLDER / \"random_sampling_minimization\" / f\"{1.0}_enhanced_cps_2024_minimised.h5\"\n", - " output_path.parent.mkdir(parents=True, exist_ok=True)\n", - " minimise_dataset(\n", - " file,\n", - " output_path,\n", - " minimization_function=minimization_function, \n", - " target_fractions=[1.0]\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b4cf8e89", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250/250 [01:24<00:00, 2.98it/s, loss=3.37e-5, loss_rel_change=-0.92] \n" - ] - } - ], - "source": [ - "input_dataset = ExtendedCPS_2024\n", - "\n", - "sim = Microsimulation(dataset=input_dataset)\n", - "data = sim.dataset.load_dataset()\n", - "data[\"household_weight\"] = {}\n", - "original_weights = sim.calculate(\"household_weight\")\n", - "original_weights = original_weights.values + np.random.normal(\n", - " 1, 0.1, len(original_weights)\n", - ")\n", - "for year in range(2024, 2025):\n", - " loss_matrix, targets_array = build_loss_matrix(\n", - " input_dataset, year\n", - " )\n", - "\n", - " bad_mask = loss_matrix.columns.isin(bad_targets)\n", - " keep_mask_bool = ~bad_mask\n", - " keep_idx = np.where(keep_mask_bool)[0]\n", - " loss_matrix_clean = loss_matrix.iloc[:, keep_idx]\n", - " targets_array_clean = targets_array[keep_idx]\n", - " assert loss_matrix_clean.shape[1] == targets_array_clean.size\n", - " assert loss_matrix_clean.shape[1] != loss_matrix.shape[1]\n", - "\n", - " optimised_weights = reweight(\n", - " original_weights,\n", - " loss_matrix_clean,\n", - " targets_array_clean,\n", - " log_path=\"baseline_calibration_log.csv\",\n", - " epochs=250, # Reduced epochs for faster processing\n", - " )\n", - " data[\"household_weight\"][year] = optimised_weights\n", - "\n", - "output_path = STORAGE_FOLDER / \"baseline\" / 
\"enhanced_cps_2024_baseline.h5\"\n", - "output_path.parent.mkdir(parents=True, exist_ok=True)\n", - "\n", - "# Save to HDF5 file\n", - "with h5py.File(output_path, \"w\") as f:\n", - " for variable, values in data.items():\n", - " for year, value in values.items():\n", - " f.create_dataset(f\"{variable}/{year}\", data=value)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "pe", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.11" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 9f0266578d91bdc8a682018b9c0d7b4e73f84e4b Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 11:11:49 +0200 Subject: [PATCH 42/56] load artifact in workflows --- .github/workflows/code_changes.yaml | 5 +++++ .github/workflows/pr_code_changes.yaml | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/.github/workflows/code_changes.yaml b/.github/workflows/code_changes.yaml index b752e953..908dd887 100644 --- a/.github/workflows/code_changes.yaml +++ b/.github/workflows/code_changes.yaml @@ -56,6 +56,11 @@ jobs: with: name: calibration_log.csv path: calibration_log.csv + - name: Save minimized ECPS calibration log + uses: actions/upload-artifact@v4 + with: + name: minimized_enhanced_cps_2024_calibration_log.csv + path: minimized_enhanced_cps_2024_calibration_log.csv - name: Run tests run: pytest - name: Upload data diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 4e30d089..524d712c 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -82,6 +82,11 @@ jobs: with: name: calibration_log.csv path: calibration_log.csv + - name: Save minimized ECPS calibration log + uses: actions/upload-artifact@v4 + with: + name: minimized_enhanced_cps_2024_calibration_log.csv + path: minimized_enhanced_cps_2024_calibration_log.csv - name: Run tests run: pytest From fdd2e5285f8200135f652f3b8373972482437cad Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 11:20:08 +0200 Subject: [PATCH 43/56] fix importing errors --- policyengine_us_data/datasets/cps/enhanced_cps.py | 9 +++++---- policyengine_us_data/utils/minimise.py | 5 ++++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 82aa9f27..a27264d8 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -9,10 +9,6 @@ import numpy as np from typing import Type from policyengine_us_data.storage import STORAGE_FOLDER -from policyengine_us_data.utils.minimise import ( - candidate_loss_contribution, - minimize_dataset, -) from policyengine_us_data.datasets.cps.extended_cps import ( ExtendedCPS_2024, CPS_2019, @@ -343,6 +339,11 @@ def generate(self): targets_array_clean = targets_array[keep_idx] assert loss_matrix_clean.shape[1] == targets_array_clean.size + from policyengine_us_data.utils.minimise import ( + candidate_loss_contribution, + minimize_dataset, + ) + minimize_dataset( self.input_dataset, self.file_path, diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimise.py index 2048ce61..17461a07 100644 --- a/policyengine_us_data/utils/minimise.py +++ 
b/policyengine_us_data/utils/minimise.py @@ -6,7 +6,6 @@ import h5py from policyengine_us_data.storage import STORAGE_FOLDER from typing import Optional, Callable -from policyengine_us_data.datasets.cps.enhanced_cps import reweight bad_targets = [ "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", @@ -141,6 +140,8 @@ def get_loss_from_mask( ] # Keep as DataFrame # Call reweight function to calibrate the selected households + from policyengine_us_data.datasets.cps.enhanced_cps import reweight + calibrated_weights_included = reweight( included_weights, included_estimate_matrix, @@ -394,6 +395,8 @@ def minimize_dataset( smaller_loss_matrix_clean = smaller_loss_matrix.iloc[:, keep_idx] smaller_targets_clean = smaller_targets[keep_idx] + from policyengine_us_data.datasets.cps.enhanced_cps import reweight + calibrated_weights = reweight( initial_weights, smaller_loss_matrix_clean, # Now matches the smaller dataset size From a87a0b9d12c6222e1806930b89cab236534e3763 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 11:38:33 +0200 Subject: [PATCH 44/56] fix dataset initialization error --- policyengine_us_data/datasets/cps/enhanced_cps.py | 3 ++- policyengine_us_data/utils/{minimise.py => minimize.py} | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) rename policyengine_us_data/utils/{minimise.py => minimize.py} (99%) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index a27264d8..195cc173 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -305,9 +305,10 @@ def generate(self): self.save_dataset(data) -class MinimizedEnhancedCPS_2024(Dataset): +class MinimizedEnhancedCPS_2024(EnhancedCPS): input_dataset = ExtendedCPS_2024 start_year = 2024 + end_year = 2024 name = "minimized_enhanced_cps_2024" label = "Minimized Enhanced CPS 2024" file_path = STORAGE_FOLDER / "minimized_enhanced_cps_2024.h5" diff --git a/policyengine_us_data/utils/minimise.py b/policyengine_us_data/utils/minimize.py similarity index 99% rename from policyengine_us_data/utils/minimise.py rename to policyengine_us_data/utils/minimize.py index 17461a07..6e61daff 100644 --- a/policyengine_us_data/utils/minimise.py +++ b/policyengine_us_data/utils/minimize.py @@ -419,7 +419,7 @@ def minimize_dataset( f.create_dataset(f"{variable}/{year}", data=value) print(f"Saved minimised dataset to {output_path}") - create_calibration_log_file(output_path, epoch=500) + create_calibration_log_file(output_path, epoch=250) if __name__ == "__main__": From 6f78752770aa7c62b8bb906dffeb398c6133331f Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 11:39:37 +0200 Subject: [PATCH 45/56] and imports --- policyengine_us_data/tests/test_datasets/test_enhanced_cps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py index c6660f66..7c815880 100644 --- a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py +++ b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py @@ -268,7 +268,7 @@ def test_minimized_enhanced_cps_calibration_quality(): - Poor (≥ 20% error): 0 points each """ from policyengine_us_data.datasets.cps import MinimizedEnhancedCPS_2024 - from policyengine_us_data.utils.minimise import create_calibration_log_file + from policyengine_us_data.utils.minimize import create_calibration_log_file from 
policyengine_us import Microsimulation sim = Microsimulation(dataset=MinimizedEnhancedCPS_2024) From 9d0c9e19d651ecf9c72f91e1924141c9522472d2 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 11:47:06 +0200 Subject: [PATCH 46/56] attempting to fix data download validation error --- .github/workflows/pr_code_changes.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 524d712c..678d7d0d 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -71,6 +71,7 @@ jobs: run: make download env: HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Build datasets run: make data From 340dc6b4b243f0fca13e368f3b7f31e27e3fcb71 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 12:07:51 +0200 Subject: [PATCH 47/56] minor bug --- policyengine_us_data/datasets/cps/enhanced_cps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 195cc173..ac6f01dc 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -340,7 +340,7 @@ def generate(self): targets_array_clean = targets_array[keep_idx] assert loss_matrix_clean.shape[1] == targets_array_clean.size - from policyengine_us_data.utils.minimise import ( + from policyengine_us_data.utils.minimize import ( candidate_loss_contribution, minimize_dataset, ) From c03eb49ecc993729d547685e745d1b852de32327 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 12:33:24 +0200 Subject: [PATCH 48/56] fix dataset path --- policyengine_us_data/utils/minimize.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/policyengine_us_data/utils/minimize.py b/policyengine_us_data/utils/minimize.py index 6e61daff..199d6fb4 100644 --- a/policyengine_us_data/utils/minimize.py +++ b/policyengine_us_data/utils/minimize.py @@ -327,10 +327,15 @@ def minimize_dataset( minimization_function : function that implements the minimization logic **kwargs : additional arguments to pass to the minimization function """ - dataset = str(dataset) - create_calibration_log_file(dataset) + # Handle both dataset class and file path + if hasattr(dataset, "file_path"): + dataset_path = str(dataset.file_path) + else: + dataset_path = str(dataset) - dataset = Dataset.from_file(dataset) + create_calibration_log_file(dataset_path) + + dataset = Dataset.from_file(dataset_path) if loss_matrix is None or targets is None: loss_matrix, targets = build_loss_matrix(dataset, 2024) From 6007db2fbdbe294e8a831fc8f622bd77278c61c7 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 13:03:12 +0200 Subject: [PATCH 49/56] fix minimize.py variables --- policyengine_us_data/utils/minimize.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/policyengine_us_data/utils/minimize.py b/policyengine_us_data/utils/minimize.py index 199d6fb4..0c4d06b6 100644 --- a/policyengine_us_data/utils/minimize.py +++ b/policyengine_us_data/utils/minimize.py @@ -345,6 +345,9 @@ def minimize_dataset( loss_matrix_clean = loss_matrix.iloc[:, keep_idx] targets_clean = targets[keep_idx] assert loss_matrix_clean.shape[1] == targets_clean.size + else: + loss_matrix_clean = loss_matrix + targets_clean = targets sim = Microsimulation(dataset=dataset) From 171d0726d4472687a88c7a3ec50e6e02e3310452 Mon Sep 17 00:00:00 
2001 From: juaristi22 Date: Tue, 15 Jul 2025 14:10:43 +0200 Subject: [PATCH 50/56] change params --- policyengine_us_data/datasets/cps/enhanced_cps.py | 8 ++++---- policyengine_us_data/utils/minimize.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index ac6f01dc..39c93f49 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -351,9 +351,9 @@ def generate(self): minimization_function=candidate_loss_contribution, loss_matrix=loss_matrix_clean, targets=targets_array_clean, - target_fractions=[0.1], # maximum relative change in loss - count_iterations=5, - view_fraction_per_iteration=0.5, + loss_rel_change_max=[0.1], # maximum relative change in loss + count_iterations=6, + view_fraction_per_iteration=0.4, fraction_remove_per_iteration=0.1, ) @@ -369,5 +369,5 @@ class EnhancedCPS_2024(EnhancedCPS): if __name__ == "__main__": - EnhancedCPS_2024().generate() + # EnhancedCPS_2024().generate() MinimizedEnhancedCPS_2024().generate() diff --git a/policyengine_us_data/utils/minimize.py b/policyengine_us_data/utils/minimize.py index 0c4d06b6..8575470a 100644 --- a/policyengine_us_data/utils/minimize.py +++ b/policyengine_us_data/utils/minimize.py @@ -167,9 +167,9 @@ def candidate_loss_contribution( targets: np.ndarray, normalisation_factor: np.ndarray, loss_rel_change_max: float, - count_iterations: int = 5, - view_fraction_per_iteration: float = 0.3, - fraction_remove_per_iteration: float = 0.1, + count_iterations: int = 10, + view_fraction_per_iteration: float = 0.5, + fraction_remove_per_iteration: float = 0.05, ) -> np.ndarray: """ Minimization approach based on candidate loss contribution. 
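(For orientation on the knobs retuned in this patch: candidate_loss_contribution prunes households over several passes, scoring a fraction of candidates per pass (view_fraction_per_iteration) and dropping a fraction of households per pass (fraction_remove_per_iteration). The snippet below is only back-of-the-envelope arithmetic under the values passed in MinimizedEnhancedCPS_2024.generate() above; the real routine also reweights the survivors and respects loss_rel_change_max, so actual counts will differ.)

# Illustrative arithmetic only -- not the library implementation.
count_iterations = 6
view_fraction_per_iteration = 0.4
fraction_remove_per_iteration = 0.1

n_households = 100_000  # assumed starting size, for illustration
for i in range(count_iterations):
    n_scored = int(n_households * view_fraction_per_iteration)    # candidates examined this pass
    n_removed = int(n_households * fraction_remove_per_iteration) # least useful candidates dropped
    n_households -= n_removed
    print(f"iteration {i + 1}: scored {n_scored} candidates, {n_households} households remain")
# At 10% removal per pass, roughly 0.9 ** 6 (about 53%) of households survive six passes.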
From 1e235814f1d3301adea65d70b9a7a5f1e247cbb5 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 14:11:31 +0200 Subject: [PATCH 51/56] round 2 --- policyengine_us_data/datasets/cps/enhanced_cps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 39c93f49..915b0d04 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -369,5 +369,5 @@ class EnhancedCPS_2024(EnhancedCPS): if __name__ == "__main__": - # EnhancedCPS_2024().generate() + EnhancedCPS_2024().generate() MinimizedEnhancedCPS_2024().generate() From 8119f7ca80865cb3a0c9748555900161f1dca915 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 16:30:53 +0200 Subject: [PATCH 52/56] check if sparse l0 approach to minimizing works --- .../datasets/cps/enhanced_cps.py | 364 ++++++++++++------ .../datasets/cps/small_enhanced_cps.py | 108 ++++++ .../test_datasets/test_sparse_enhanced_cps.py | 85 ++++ policyengine_us_data/utils/__init__.py | 1 + policyengine_us_data/utils/l0.py | 208 ++++++++++ 5 files changed, 653 insertions(+), 113 deletions(-) create mode 100644 policyengine_us_data/tests/test_datasets/test_sparse_enhanced_cps.py create mode 100644 policyengine_us_data/utils/l0.py diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 915b0d04..38e9fad0 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -1,10 +1,8 @@ from policyengine_core.data import Dataset import pandas as pd from policyengine_us_data.utils import ( - pe_to_soi, - get_soi, build_loss_matrix, - fmt, + HardConcrete, ) import numpy as np from typing import Type @@ -15,6 +13,10 @@ CPS_2024, ) import os +import logging + +logger = logging.getLogger(__name__) +logger.setLevel(logging.WARNING) try: import torch @@ -41,133 +43,232 @@ def reweight( dropout_rate=0.05, epochs=500, log_path="calibration_log.csv", - penalty_approach=None, + l0_lambda=1e-5, + init_mean=0.999, + temperature=0.5, + sparse=False, ): + if loss_matrix.shape[1] == 0: + raise ValueError("loss_matrix has no columns after filtering") + + # Store column names before converting to tensor target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") - loss_matrix = torch.tensor(loss_matrix.values, dtype=torch.float32) + + # Keep numpy versions for final diagnostics + loss_matrix_numpy = loss_matrix.values + targets_array_numpy = np.array(targets_array) + + # Convert to tensors for training + loss_matrix_tensor = torch.tensor(loss_matrix_numpy, dtype=torch.float32) + targets_array_tensor = torch.tensor( + targets_array_numpy, dtype=torch.float32 + ) + + # Compute normalization factors nation_normalisation_factor = is_national * (1 / is_national.sum()) state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) normalisation_factor = np.where( is_national, nation_normalisation_factor, state_normalisation_factor ) - normalisation_factor = torch.tensor( + normalisation_factor_tensor = torch.tensor( normalisation_factor, dtype=torch.float32 ) - targets_array = torch.tensor(targets_array, dtype=torch.float32) + inv_mean_normalisation = 1 / np.mean(normalisation_factor) + + # Initialize weights weights = torch.tensor( np.log(original_weights), requires_grad=True, dtype=torch.float32 ) - # TO DO: replace this with a 
call to the python reweight.py package. - def loss(weights, penalty_approach=penalty_approach): - # Check for Nans in either the weights or the loss matrix + def loss(weights): if torch.isnan(weights).any(): raise ValueError("Weights contain NaNs") - if torch.isnan(loss_matrix).any(): + if torch.isnan(loss_matrix_tensor).any(): raise ValueError("Loss matrix contains NaNs") - estimate = weights @ loss_matrix + + estimate = weights @ loss_matrix_tensor + if torch.isnan(estimate).any(): raise ValueError("Estimate contains NaNs") + rel_error = ( - ((estimate - targets_array) + 1) / (targets_array + 1) + ((estimate - targets_array_tensor) + 1) + / (targets_array_tensor + 1) ) ** 2 - rel_error_normalized = rel_error * normalisation_factor + rel_error_normalized = ( + inv_mean_normalisation * rel_error * normalisation_factor_tensor + ) if torch.isnan(rel_error_normalized).any(): raise ValueError("Relative error contains NaNs") - if penalty_approach is not None: - # L0 penalty (approximated with smooth function) - # Since L0 is non-differentiable, we use a smooth approximation - # Common approaches: - - epsilon = 1e-3 # Threshold for "near zero" - l0_penalty_weight = 1e-1 # Adjust this hyperparameter - - # Option 1: Sigmoid approximation - if penalty_approach == "l0_sigmoid": - smoothed_l0 = torch.sigmoid( - (weights - epsilon) / (epsilon * 0.1) - ).mean() - - # Option 2: Log-sum penalty (smoother) - if penalty_approach == "l0_log": - smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len( - weights - ) - - # Option 3: Exponential penalty - if penalty_approach == "l0_exp": - smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() - - # L1 penalty - l1_penalty_weight = 1e-2 # Adjust this hyperparameterxs - - if penalty_approach == "l1": - l1 = torch.mean(weights) - return rel_error_normalized.mean() + l1_penalty_weight * l1 - - return ( - rel_error_normalized.mean() + l0_penalty_weight * smoothed_l0 - ) - - else: - return rel_error_normalized.mean() + return rel_error_normalized.mean() def dropout_weights(weights, p): if p == 0: return weights - # Replace p% of the weights with the mean value of the rest of them mask = torch.rand_like(weights) < p mean = weights[~mask].mean() masked_weights = weights.clone() masked_weights[mask] = mean return masked_weights - optimizer = torch.optim.Adam([weights], lr=3e-1) - from tqdm import trange - - start_loss = None - - iterator = trange(epochs) - performance = pd.DataFrame() - for i in iterator: - optimizer.zero_grad() - weights_ = dropout_weights(weights, dropout_rate) - l = loss(torch.exp(weights_)) - if (log_path is not None) and (i % 10 == 0): - estimates = torch.exp(weights) @ loss_matrix - estimates = estimates.detach().numpy() - df = pd.DataFrame( - { - "target_name": target_names, - "estimate": estimates, - "target": targets_array.detach().numpy(), - } + def compute_diagnostics(final_weights, label=""): + """Helper function to compute and log diagnostics""" + estimate = final_weights @ loss_matrix_numpy + rel_error = ( + ((estimate - targets_array_numpy) + 1) / (targets_array_numpy + 1) + ) ** 2 + within_10_percent_mask = np.abs(estimate - targets_array_numpy) <= ( + 0.10 * np.abs(targets_array_numpy) + ) + percent_within_10 = np.mean(within_10_percent_mask) * 100 + + logger.info( + f"\n\n---{label} Solutions: reweighting quick diagnostics----\n" + ) + logger.info( + f"{np.sum(final_weights == 0)} are zero, {np.sum(final_weights != 0)} weights are nonzero" + ) + logger.info( + f"rel_error: min: {np.min(rel_error):.2f}\n" + f"max: 
{np.max(rel_error):.2f}\n" + f"mean: {np.mean(rel_error):.2f}\n" + f"median: {np.median(rel_error):.2f}\n" + f"Within 10% of target: {percent_within_10:.2f}%" + ) + logger.info("Relative error over 100% for:") + for i in np.where(rel_error > 1)[0]: + logger.info(f"target_name: {target_names[i]}") + logger.info(f"target_value: {targets_array_numpy[i]}") + logger.info(f"estimate_value: {estimate[i]}") + logger.info(f"has rel_error: {rel_error[i]:.2f}\n") + logger.info("---End of reweighting quick diagnostics------") + + if not sparse: + # Dense training + optimizer = torch.optim.Adam([weights], lr=3e-1) + from tqdm import trange + + start_loss = None + iterator = trange(epochs) + performance = pd.DataFrame() + + for i in iterator: + optimizer.zero_grad() + weights_ = dropout_weights(weights, dropout_rate) + l = loss(torch.exp(weights_)) + + if (log_path is not None) and (i % 10 == 0): + with torch.no_grad(): + estimates = ( + torch.exp(weights) @ loss_matrix_tensor + ).numpy() + df = pd.DataFrame( + { + "target_name": target_names, + "estimate": estimates, + "target": targets_array_numpy, + } + ) + df["epoch"] = i + df["error"] = df.estimate - df.target + df["rel_error"] = df.error / df.target + df["abs_error"] = df.error.abs() + df["rel_abs_error"] = df.rel_error.abs() + df["loss"] = df.rel_abs_error**2 + performance = pd.concat([performance, df], ignore_index=True) + + if (log_path is not None) and (i % 1000 == 0): + performance.to_csv(log_path, index=False) + + if start_loss is None: + start_loss = l.item() + loss_rel_change = (l.item() - start_loss) / start_loss + + l.backward() + iterator.set_postfix( + {"loss": l.item(), "loss_rel_change": loss_rel_change} ) - df["epoch"] = i - df["error"] = df.estimate - df.target - df["rel_error"] = df.error / df.target - df["abs_error"] = df.error.abs() - df["rel_abs_error"] = df.rel_error.abs() - df["loss"] = df.rel_abs_error**2 - performance = pd.concat([performance, df], ignore_index=True) - - if (log_path is not None) and (i % 1000 == 0): + optimizer.step() + + if log_path is not None: performance.to_csv(log_path, index=False) - if start_loss is None: - start_loss = l.item() - loss_rel_change = (l.item() - start_loss) / start_loss - l.backward() - iterator.set_postfix( - {"loss": l.item(), "loss_rel_change": loss_rel_change} + + final_weights_dense = torch.exp(weights).detach().numpy() + compute_diagnostics(final_weights_dense, "Dense") + return final_weights_dense + + else: + # Sparse training + weights = torch.tensor( + np.log(original_weights), requires_grad=True, dtype=torch.float32 ) - optimizer.step() + gates = HardConcrete( + len(original_weights), init_mean=init_mean, temperature=temperature + ) + + optimizer = torch.optim.Adam( + [weights] + list(gates.parameters()), lr=3e-1 + ) + from tqdm import trange + + start_loss = None + iterator = trange(epochs) + performance = pd.DataFrame() + + for i in iterator: + optimizer.zero_grad() + weights_ = dropout_weights(weights, dropout_rate) + masked = torch.exp(weights_) * gates() + l_main = loss(masked) + l = l_main + l0_lambda * gates.get_penalty() + + if (log_path is not None) and (i % 10 == 0): + gates.eval() + with torch.no_grad(): + estimates = ( + (torch.exp(weights) * gates()) @ loss_matrix_tensor + ).numpy() + gates.train() + + df = pd.DataFrame( + { + "target_name": target_names, + "estimate": estimates, + "target": targets_array_numpy, + } + ) + df["epoch"] = i + df["error"] = df.estimate - df.target + df["rel_error"] = df.error / df.target + df["abs_error"] = df.error.abs() + 
df["rel_abs_error"] = df.rel_error.abs() + df["loss"] = df.rel_abs_error**2 + performance = pd.concat([performance, df], ignore_index=True) + + if (log_path is not None) and (i % 1000 == 0): + performance.to_csv(log_path, index=False) + + if start_loss is None: + start_loss = l.item() + loss_rel_change = (l.item() - start_loss) / start_loss + + l.backward() + iterator.set_postfix( + {"loss": l.item(), "loss_rel_change": loss_rel_change} + ) + optimizer.step() + if log_path is not None: performance.to_csv(log_path, index=False) - return torch.exp(weights).detach().numpy() + gates.eval() + final_weights_sparse = (torch.exp(weights) * gates()).detach().numpy() + compute_diagnostics(final_weights_sparse, "Sparse") + + return final_weights_sparse def train_previous_year_income_model(): @@ -253,26 +354,6 @@ def generate(self): ) data["household_weight"][year] = optimised_weights - print("\n\n---reweighting quick diagnostics----\n") - estimate = optimised_weights @ loss_matrix_clean - rel_error = ( - ((estimate - targets_array_clean) + 1) - / (targets_array_clean + 1) - ) ** 2 - print( - f"rel_error: min: {np.min(rel_error):.2f}, " - f"max: {np.max(rel_error):.2f} " - f"mean: {np.mean(rel_error):.2f}, " - f"median: {np.median(rel_error):.2f}" - ) - print("Relative error over 100% for:") - for i in np.where(rel_error > 1)[0]: - print(f"target_name: {loss_matrix_clean.columns[i]}") - print(f"target_value: {targets_array_clean[i]}") - print(f"estimate_value: {estimate[i]}") - print(f"has rel_error: {rel_error[i]:.2f}\n") - print("---End of reweighting quick diagnostics------") - self.save_dataset(data) @@ -336,8 +417,18 @@ def generate(self): bad_mask = loss_matrix.columns.isin(bad_targets) keep_mask_bool = ~bad_mask keep_idx = np.where(keep_mask_bool)[0] - loss_matrix_clean = loss_matrix.iloc[:, keep_idx] - targets_array_clean = targets_array[keep_idx] + + # Check if filtering would remove all columns + if len(keep_idx) == 0: + print( + "WARNING: bad_targets filtering would remove all columns, using all columns instead" + ) + keep_idx = np.arange(loss_matrix.shape[1]) + targets_array_clean = targets_array + loss_matrix_clean = loss_matrix + else: + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_array_clean = targets_array[keep_idx] assert loss_matrix_clean.shape[1] == targets_array_clean.size from policyengine_us_data.utils.minimize import ( @@ -358,6 +449,52 @@ def generate(self): ) +class SparseEnhancedCPS_2024(EnhancedCPS): + input_dataset = ExtendedCPS_2024 + start_year = 2024 + end_year = 2024 + name = "sparse_enhanced_cps_2024" + label = "Sparse Enhanced CPS 2024" + file_path = STORAGE_FOLDER / "sparse_enhanced_cps_2024.h5" + url = "hf://policyengine/policyengine-us-data/sparse_enhanced_cps_2024.h5" + + def generate(self): + from policyengine_us import Microsimulation + + sim = Microsimulation(dataset=self.input_dataset) + data = sim.dataset.load_dataset() + data["household_weight"] = {} + original_weights = sim.calculate("household_weight") + original_weights = original_weights.values + np.random.normal( + 1, 0.1, len(original_weights) + ) + + # Run the optimization procedure to get (close to) minimum loss weights + for year in range(self.start_year, self.end_year + 1): + loss_matrix, targets_array = build_loss_matrix( + self.input_dataset, year + ) + + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_array_clean = targets_array[keep_idx] + assert 
loss_matrix_clean.shape[1] == targets_array_clean.size + + optimised_weights = reweight( + original_weights, + loss_matrix_clean, + targets_array_clean, + log_path="calibration_log.csv", + epochs=150, + sparse=True, + ) + data["household_weight"][year] = optimised_weights + + self.save_dataset(data) + + class EnhancedCPS_2024(EnhancedCPS): input_dataset = ExtendedCPS_2024 start_year = 2024 @@ -369,5 +506,6 @@ class EnhancedCPS_2024(EnhancedCPS): if __name__ == "__main__": - EnhancedCPS_2024().generate() - MinimizedEnhancedCPS_2024().generate() + # EnhancedCPS_2024().generate() + # MinimizedEnhancedCPS_2024().generate() + SparseEnhancedCPS_2024().generate() diff --git a/policyengine_us_data/datasets/cps/small_enhanced_cps.py b/policyengine_us_data/datasets/cps/small_enhanced_cps.py index 976725d9..9e8d697c 100644 --- a/policyengine_us_data/datasets/cps/small_enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/small_enhanced_cps.py @@ -1,5 +1,8 @@ +import pandas as pd import numpy as np +from policyengine_core.data.dataset import Dataset + def create_small_ecps(): from policyengine_us import Microsimulation @@ -37,6 +40,111 @@ def create_small_ecps(): grp.create_dataset(str(period), data=values) +def create_sparse_ecps(): + from policyengine_us import Microsimulation + from policyengine_us_data.datasets import SparseEnhancedCPS_2024 + from policyengine_us_data.storage import STORAGE_FOLDER + from policyengine_core.enums import Enum + + time_period = 2024 + + ecps = SparseEnhancedCPS_2024() + h5 = ecps.load() + sparse_weights = h5["household_sparse_weight"]["2024"][:] + hh_ids = h5["household_id"]["2024"][:] + + template_sim = Microsimulation( + dataset=EnhancedCPS_2024, + ) + template_sim.set_input("household_weight", 2024, sparse_weights) + + template_df = template_sim.to_input_dataframe() + + household_weight_column = f"household_weight__{time_period}" + df_household_id_column = f"household_id__{time_period}" + df_person_id_column = f"person_id__{time_period}" + + # Group by household ID and get the first entry for each group + df = template_df + h_df = df.groupby(df_household_id_column).first() + h_ids = pd.Series(h_df.index) + h_weights = pd.Series(h_df[household_weight_column].values) + + # Seed the random number generators for reproducibility + h_ids = h_ids[h_weights > 0] + h_weights = h_weights[h_weights > 0] + + subset_df = df[df[df_household_id_column].isin(h_ids)].copy() + + household_id_to_count = {} + for household_id in h_ids: + if household_id not in household_id_to_count: + household_id_to_count[household_id] = 0 + household_id_to_count[household_id] += 1 + + household_counts = subset_df[df_household_id_column].map( + lambda x: household_id_to_count.get(x, 0) + ) + + # NOTE: from subsample. I don't think I want to do this! + ## Adjust household weights to maintain the total weight + # for col in subset_df.columns: + # if "weight__" in col: + # target_total_weight = df[col].values.sum() + # if not quantize_weights: + # subset_df[col] *= household_counts.values + # else: + # subset_df[col] = household_counts.values + # subset_df[col] *= ( + # target_total_weight / subset_df[col].values.sum() + # ) + + df = subset_df + + # Update the dataset and rebuild the simulation + sim = Microsimulation() + sim.dataset = Dataset.from_dataframe(df, sim.dataset.time_period) + sim.build_from_dataset() + + # Ensure the baseline branch has the new data. 
+ if "baseline" in sim.branches: + baseline_tax_benefit_system = sim.branches[ + "baseline" + ].tax_benefit_system + sim.branches["baseline"] = sim.clone() + sim.branches["tax_benefit_system"] = baseline_tax_benefit_system + + sim.default_calculation_period = time_period + + # Get ready to write it out + simulation = sim + data = {} + for variable in simulation.tax_benefit_system.variables: + data[variable] = {} + for time_period in simulation.get_holder(variable).get_known_periods(): + values = simulation.get_holder(variable).get_array(time_period) + values = np.array(values) + if simulation.tax_benefit_system.variables.get( + variable + ).value_type in (Enum, str): + values = values.astype("S") + if values is not None: + data[variable][time_period] = values + + if len(data[variable]) == 0: + del data[variable] + + import h5py + + with h5py.File(STORAGE_FOLDER / "sparse_enhanced_cps_2024.h5", "w") as f: + for variable, periods in data.items(): + grp = f.create_group(variable) + for period, values in periods.items(): + grp.create_dataset(str(period), data=values) + + if __name__ == "__main__": create_small_ecps() print("Small CPS dataset created successfully.") + create_sparse_ecps() + print("Sparse CPS dataset created successfully.") diff --git a/policyengine_us_data/tests/test_datasets/test_sparse_enhanced_cps.py b/policyengine_us_data/tests/test_datasets/test_sparse_enhanced_cps.py new file mode 100644 index 00000000..b807c1ef --- /dev/null +++ b/policyengine_us_data/tests/test_datasets/test_sparse_enhanced_cps.py @@ -0,0 +1,85 @@ +import pytest + +import numpy as np + +from policyengine_us_data.utils import build_loss_matrix + + +def test_sparse_ecps(): + from policyengine_core.data import Dataset + from policyengine_us_data.storage import STORAGE_FOLDER + from policyengine_us import Microsimulation + + # NOTE: replace with "small_enhanced_cps_2024.h5 to see the difference! 
+ sim = Microsimulation( + dataset=Dataset.from_file( + STORAGE_FOLDER / f"sparse_enhanced_cps_2024.h5", + ) + ) + + data = sim.dataset.load_dataset() + bad_targets = [ + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "state/RI/adjusted_gross_income/amount/-inf_1", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "state/RI/adjusted_gross_income/amount/-inf_1", + "nation/irs/exempt interest/count/AGI in -inf-inf/taxable/All", + ] + + year = 2024 + loss_matrix, targets_array = build_loss_matrix(sim.dataset, year) + zero_mask = np.isclose(targets_array, 0.0, atol=0.1) + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~(zero_mask | bad_mask) + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_array_clean = targets_array[keep_idx] + assert loss_matrix_clean.shape[1] == targets_array_clean.size + + optimised_weights = data["household_weight"]["2024"] + print("\n\n---Sparse Solutions: reweighting quick diagnostics----\n") + print( + f"{np.sum(optimised_weights == 0)} are zero, {np.sum(optimised_weights != 0)} weights are nonzero" + ) + estimate = optimised_weights @ loss_matrix_clean + rel_error = ( + ((estimate - targets_array_clean) + 1) / (targets_array_clean + 1) + ) ** 2 + within_10_percent_mask = np.abs(estimate - targets_array_clean) <= ( + 0.10 * np.abs(targets_array_clean) + ) + percent_within_10 = np.mean(within_10_percent_mask) * 100 + print( + f"rel_error: min: {np.min(rel_error):.2f}\n" + f"max: {np.max(rel_error):.2f}\n" + f"mean: {np.mean(rel_error):.2f}\n" + f"median: {np.median(rel_error):.2f}\n" + f"Within 10% of target: {percent_within_10:.2f}%" + ) + print("Relative error over 100% for:") + for i in np.where(rel_error > 1)[0]: + print(f"target_name: {loss_matrix_clean.columns[i]}") + print(f"target_value: {targets_array_clean[i]}") + print(f"estimate_value: {estimate[i]}") + print(f"has rel_error: {rel_error[i]:.2f}\n") + print("---End of reweighting quick diagnostics------") + + assert percent_within_10 > 70.0 + + +if __name__ == "__main__": + test_sparse_ecps() diff --git a/policyengine_us_data/utils/__init__.py b/policyengine_us_data/utils/__init__.py index d25c6c2f..136d2503 100644 ---
a/policyengine_us_data/utils/__init__.py +++ b/policyengine_us_data/utils/__init__.py @@ -3,3 +3,4 @@ from .uprating import * from .loss import * from .qrf import * +from .l0 import * diff --git a/policyengine_us_data/utils/l0.py b/policyengine_us_data/utils/l0.py new file mode 100644 index 00000000..ebd89d0a --- /dev/null +++ b/policyengine_us_data/utils/l0.py @@ -0,0 +1,208 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import math + + +class HardConcrete(nn.Module): + """HardConcrete distribution for L0 regularization.""" + + def __init__( + self, + input_dim, + output_dim=None, + temperature=0.5, + stretch=0.1, + init_mean=0.5, + ): + super().__init__() + if output_dim is None: + self.gate_size = (input_dim,) + else: + self.gate_size = (input_dim, output_dim) + self.qz_logits = nn.Parameter(torch.zeros(self.gate_size)) + self.temperature = temperature + self.stretch = stretch + self.gamma = -0.1 + self.zeta = 1.1 + self.init_mean = init_mean + self.reset_parameters() + + def reset_parameters(self): + if self.init_mean is not None: + init_val = math.log(self.init_mean / (1 - self.init_mean)) + self.qz_logits.data.fill_(init_val) + + def forward(self, input_shape=None): + if self.training: + gates = self._sample_gates() + else: + gates = self._deterministic_gates() + if input_shape is not None and len(input_shape) > len(gates.shape): + gates = gates.unsqueeze(-1).unsqueeze(-1) + return gates + + def _sample_gates(self): + u = torch.zeros_like(self.qz_logits).uniform_(1e-8, 1.0 - 1e-8) + s = torch.log(u) - torch.log(1 - u) + self.qz_logits + s = torch.sigmoid(s / self.temperature) + s = s * (self.zeta - self.gamma) + self.gamma + gates = torch.clamp(s, 0, 1) + return gates + + def _deterministic_gates(self): + probs = torch.sigmoid(self.qz_logits) + gates = probs * (self.zeta - self.gamma) + self.gamma + return torch.clamp(gates, 0, 1) + + def get_penalty(self): + logits_shifted = self.qz_logits - self.temperature * math.log( + -self.gamma / self.zeta + ) + prob_active = torch.sigmoid(logits_shifted) + return prob_active.sum() + + def get_active_prob(self): + logits_shifted = self.qz_logits - self.temperature * math.log( + -self.gamma / self.zeta + ) + return torch.sigmoid(logits_shifted) + + +class L0Linear(nn.Module): + """Linear layer with L0 regularization using HardConcrete gates.""" + + def __init__( + self, + in_features, + out_features, + bias=True, + temperature=0.5, + init_sparsity=0.5, + ): + super().__init__() + self.in_features = in_features + self.out_features = out_features + self.weight = nn.Parameter(torch.Tensor(out_features, in_features)) + if bias: + self.bias = nn.Parameter(torch.Tensor(out_features)) + else: + self.register_parameter("bias", None) + self.weight_gates = HardConcrete( + out_features, + in_features, + temperature=temperature, + init_mean=init_sparsity, + ) + self.reset_parameters() + + def reset_parameters(self): + nn.init.kaiming_normal_(self.weight, mode="fan_out") + if self.bias is not None: + nn.init.zeros_(self.bias) + + def forward(self, input): + gates = self.weight_gates() + masked_weight = self.weight * gates + return F.linear(input, masked_weight, self.bias) + + def get_l0_penalty(self): + return self.weight_gates.get_penalty() + + def get_sparsity(self): + with torch.no_grad(): + prob_active = self.weight_gates.get_active_prob() + return 1.0 - prob_active.mean().item() + + +class SparseMLP(nn.Module): + """Example MLP with L0 regularization on all layers""" + + def __init__( + self, + input_dim=784, + 
hidden_dim=256, + output_dim=10, + init_sparsity=0.5, + temperature=0.5, + ): + super().__init__() + self.fc1 = L0Linear( + input_dim, + hidden_dim, + init_sparsity=init_sparsity, + temperature=temperature, + ) + self.fc2 = L0Linear( + hidden_dim, + hidden_dim, + init_sparsity=init_sparsity, + temperature=temperature, + ) + self.fc3 = L0Linear( + hidden_dim, + output_dim, + init_sparsity=init_sparsity, + temperature=temperature, + ) + + def forward(self, x): + x = x.view(x.size(0), -1) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + def get_l0_loss(self): + l0_loss = 0 + for module in self.modules(): + if isinstance(module, L0Linear): + l0_loss += module.get_l0_penalty() + return l0_loss + + def get_sparsity_stats(self): + stats = {} + for name, module in self.named_modules(): + if isinstance(module, L0Linear): + stats[name] = { + "sparsity": module.get_sparsity(), + "active_params": module.get_l0_penalty().item(), + } + return stats + + +def train_with_l0(model, train_loader, epochs=10, l0_lambda=1e-3): + optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) + criterion = nn.CrossEntropyLoss() + for epoch in range(epochs): + total_loss = 0 + total_l0 = 0 + for batch_idx, (data, target) in enumerate(train_loader): + optimizer.zero_grad() + output = model(data) + ce_loss = criterion(output, target) + l0_loss = model.get_l0_loss() + loss = ce_loss + l0_lambda * l0_loss + loss.backward() + optimizer.step() + total_loss += ce_loss.item() + total_l0 += l0_loss.item() + if epoch % 1 == 0: + sparsity_stats = model.get_sparsity_stats() + print( + f"Epoch {epoch}: Loss={total_loss/len(train_loader):.4f}, L0={total_l0/len(train_loader):.4f}" + ) + for layer, stats in sparsity_stats.items(): + print( + f" {layer}: {stats['sparsity']*100:.1f}% sparse, {stats['active_params']:.1f} active params" + ) + + +def prune_model(model, threshold=0.05): + for module in model.modules(): + if isinstance(module, L0Linear): + with torch.no_grad(): + prob_active = module.weight_gates.get_active_prob() + mask = (prob_active > threshold).float() + module.weight.data *= mask + return model From 5ae89d5a515ca08113a8202928011008fcb32871 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 16:57:51 +0200 Subject: [PATCH 53/56] update datasets to be generated --- policyengine_us_data/datasets/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/policyengine_us_data/datasets/__init__.py b/policyengine_us_data/datasets/__init__.py index 87461837..c0f2c8fd 100644 --- a/policyengine_us_data/datasets/__init__.py +++ b/policyengine_us_data/datasets/__init__.py @@ -14,6 +14,8 @@ CensusCPS_2023, EnhancedCPS_2024, ReweightedCPS_2024, + MinimizedEnhancedCPS_2024, + SparseEnhancedCPS_2024, ) from .puf import PUF_2015, PUF_2021, PUF_2024, IRS_PUF_2015 from .acs import ACS_2022 From 0521be64c5faf3736e24dfb8e4f29109d8bfb1d4 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 17:17:32 +0200 Subject: [PATCH 54/56] try adding logic to generate the sparse ecps if missing --- .../datasets/cps/enhanced_cps.py | 4 ++++ .../datasets/cps/small_enhanced_cps.py | 17 ++++++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 38e9fad0..d246a89d 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -491,6 +491,10 @@ def generate(self): sparse=True, ) data["household_weight"][year] 
= optimised_weights + # Also save as sparse weights for small_enhanced_cps.py + if "household_sparse_weight" not in data: + data["household_sparse_weight"] = {} + data["household_sparse_weight"][year] = optimised_weights self.save_dataset(data) diff --git a/policyengine_us_data/datasets/cps/small_enhanced_cps.py b/policyengine_us_data/datasets/cps/small_enhanced_cps.py index 9e8d697c..db13b770 100644 --- a/policyengine_us_data/datasets/cps/small_enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/small_enhanced_cps.py @@ -49,9 +49,20 @@ def create_sparse_ecps(): time_period = 2024 ecps = SparseEnhancedCPS_2024() - h5 = ecps.load() - sparse_weights = h5["household_sparse_weight"]["2024"][:] - hh_ids = h5["household_id"]["2024"][:] + + # Check if sparse weights exist, if not generate them + try: + h5 = ecps.load() + sparse_weights = h5["household_sparse_weight"]["2024"][:] + hh_ids = h5["household_id"]["2024"][:] + except KeyError: + print( + "Sparse weights not found. Generating SparseEnhancedCPS_2024 dataset..." + ) + ecps.generate() + h5 = ecps.load() + sparse_weights = h5["household_sparse_weight"]["2024"][:] + hh_ids = h5["household_id"]["2024"][:] template_sim = Microsimulation( dataset=EnhancedCPS_2024, From 1d38077b0ee938709647acf4626dc6b24ed72a12 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 17:35:44 +0200 Subject: [PATCH 55/56] make saving minimized ECPS optional --- .github/workflows/code_changes.yaml | 1 + .github/workflows/pr_code_changes.yaml | 1 + policyengine_us_data/datasets/cps/enhanced_cps.py | 7 ++++++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/code_changes.yaml b/.github/workflows/code_changes.yaml index 908dd887..6c619b40 100644 --- a/.github/workflows/code_changes.yaml +++ b/.github/workflows/code_changes.yaml @@ -61,6 +61,7 @@ jobs: with: name: minimized_enhanced_cps_2024_calibration_log.csv path: minimized_enhanced_cps_2024_calibration_log.csv + if-no-files-found: ignore - name: Run tests run: pytest - name: Upload data diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 678d7d0d..4c2d6cbf 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -88,6 +88,7 @@ jobs: with: name: minimized_enhanced_cps_2024_calibration_log.csv path: minimized_enhanced_cps_2024_calibration_log.csv + if-no-files-found: ignore - name: Run tests run: pytest diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index d246a89d..984308bc 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -460,6 +460,9 @@ class SparseEnhancedCPS_2024(EnhancedCPS): def generate(self): from policyengine_us import Microsimulation + from policyengine_us_data.utils.minimize import ( + create_calibration_log_file, + ) sim = Microsimulation(dataset=self.input_dataset) data = sim.dataset.load_dataset() @@ -498,6 +501,8 @@ def generate(self): self.save_dataset(data) + create_calibration_log_file(self.file_path) + class EnhancedCPS_2024(EnhancedCPS): input_dataset = ExtendedCPS_2024 @@ -510,6 +515,6 @@ class EnhancedCPS_2024(EnhancedCPS): if __name__ == "__main__": - # EnhancedCPS_2024().generate() + EnhancedCPS_2024().generate() # MinimizedEnhancedCPS_2024().generate() SparseEnhancedCPS_2024().generate() From 6bb0fb17dcd8bb017810f236c1d82a9e54ddaf86 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Tue, 15 Jul 2025 18:10:25 +0200 Subject: 
[PATCH 56/56] reducing iterations hoping jobs don't get killed --- policyengine_us_data/utils/minimize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_us_data/utils/minimize.py b/policyengine_us_data/utils/minimize.py index 8575470a..ce2c6fdf 100644 --- a/policyengine_us_data/utils/minimize.py +++ b/policyengine_us_data/utils/minimize.py @@ -167,7 +167,7 @@ def candidate_loss_contribution( targets: np.ndarray, normalisation_factor: np.ndarray, loss_rel_change_max: float, - count_iterations: int = 10, + count_iterations: int = 5, view_fraction_per_iteration: float = 0.5, fraction_remove_per_iteration: float = 0.05, ) -> np.ndarray:
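(For orientation on the sparse reweighting added in PATCH 52: HardConcrete attaches a learnable gate in [0, 1] to each household weight; gates are sampled stochastically during training, become deterministic in eval mode, and get_penalty() approximates the expected number of open gates, which reweight(..., sparse=True) adds to the calibration loss scaled by l0_lambda. Below is a minimal usage sketch of the class created in policyengine_us_data/utils/l0.py; the toy weights, seed, and sizes are illustrative and not part of the patches.)

import torch
from policyengine_us_data.utils.l0 import HardConcrete

torch.manual_seed(0)
base_weights = torch.rand(1_000) * 100  # stand-in household weights, not real data
gates = HardConcrete(1_000, init_mean=0.999, temperature=0.5)

# Training mode (default): gates are sampled stochastically, so masked weights differ per draw.
masked = base_weights * gates()
l0_lambda = 1e-5
penalty = l0_lambda * gates.get_penalty()  # get_penalty() ~ expected count of open gates
print("L0 penalty term:", float(penalty))

# Evaluation mode: gates become deterministic; fully closed gates zero out their weights.
gates.eval()
final_weights = (base_weights * gates()).detach()
print(int((final_weights == 0).sum()), "weights are exactly zero before any training")

In the actual SparseEnhancedCPS_2024 pipeline the gates are optimized jointly with the log-weights, so gates driven fully closed yield household weights that are exactly zero while the remaining weights stay calibrated to the targets.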