diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..ac664753 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: minor + changes: + added: + - Enhanced CPS minimizing tests. \ No newline at end of file diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 7a471d40..2fbb0293 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -22,13 +22,27 @@ torch = None +bad_targets = [ + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", +] + + def reweight( original_weights, loss_matrix, targets_array, dropout_rate=0.05, - log_path="calibration_log.csv", epochs=150, + log_path="calibration_log.csv", + penalty_approach=None, + penalty_weight=None, ): target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") @@ -46,8 +60,12 @@ def reweight( np.log(original_weights), requires_grad=True, dtype=torch.float32 ) - # TODO: replace this functionality from the microcalibrate package. - def loss(weights): + # TO DO: replace this with a call to the python reweight.py package. + def loss( + weights, + penalty_approach=penalty_approach, + penalty_weight=penalty_weight, + ): # Check for Nans in either the weights or the loss matrix if torch.isnan(weights).any(): raise ValueError("Weights contain NaNs") @@ -60,9 +78,51 @@ def loss(weights): ((estimate - targets_array) + 1) / (targets_array + 1) ) ** 2 rel_error_normalized = rel_error * normalisation_factor + if torch.isnan(rel_error_normalized).any(): raise ValueError("Relative error contains NaNs") - return rel_error_normalized.mean() + + if penalty_approach is not None and penalty_weight is not None: + # L0 penalty (approximated with smooth function) + # Since L0 is non-differentiable, we use a smooth approximation + # Common approaches: + + epsilon = 1e-3 # Threshold for "near zero" + + # Option 1: Sigmoid approximation + if penalty_approach == "l0_sigmoid": + smoothed_l0 = torch.sigmoid( + (weights - epsilon) / (epsilon * 0.1) + ).mean() + + # Option 2: Log-sum penalty (smoother) + if penalty_approach == "l0_log": + smoothed_l0 = torch.log(1 + weights / epsilon).sum() / len( + weights + ) + + # Option 3: Exponential penalty + if penalty_approach == "l0_exp": + smoothed_l0 = (1 - torch.exp(-weights / epsilon)).mean() + + if penalty_approach == "l1": + l1 = torch.mean(weights) + return rel_error_normalized.mean() + penalty_weight * l1 + + return rel_error_normalized.mean() + penalty_weight * smoothed_l0 + + else: + return rel_error_normalized.mean() + + def prune_dataset(weights, epsilon=1e-3): + """ + Prune dataset samples based on learned weights. + Returns indices of samples to keep. + """ + importance_scores = weights.detach().cpu().numpy() + keep_indices = np.where(importance_scores > epsilon)[0] + + return keep_indices def dropout_weights(weights, p): if p == 0: @@ -207,9 +267,9 @@ def generate(self): loss_matrix, targets_array = build_loss_matrix( self.input_dataset, year ) - zero_mask = np.isclose(targets_array, 0.0, atol=0.1) + bad_mask = loss_matrix.columns.isin(bad_targets) - keep_mask_bool = ~(zero_mask | bad_mask) + keep_mask_bool = ~bad_mask keep_idx = np.where(keep_mask_bool)[0] loss_matrix_clean = loss_matrix.iloc[:, keep_idx] targets_array_clean = targets_array[keep_idx] @@ -220,7 +280,7 @@ def generate(self): loss_matrix_clean, targets_array_clean, log_path="calibration_log.csv", - epochs=150, + epochs= 150, ) data["household_weight"][year] = optimised_weights diff --git a/policyengine_us_data/storage/upload_completed_datasets.py b/policyengine_us_data/storage/upload_completed_datasets.py index f161a9ee..16885d8c 100644 --- a/policyengine_us_data/storage/upload_completed_datasets.py +++ b/policyengine_us_data/storage/upload_completed_datasets.py @@ -15,6 +15,7 @@ def upload_datasets(): Pooled_3_Year_CPS_2023.file_path, CPS_2023.file_path, STORAGE_FOLDER / "small_enhanced_cps_2024.h5", + STORAGE_FOLDER / "enhanced_cps_2024_minified.h5", ] for file_path in dataset_files: diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index 21abce0f..fbdbacef 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -552,11 +552,6 @@ def build_loss_matrix(dataset: type, time_period): # Convert to thousands for the target targets_array.append(row["enrollment"]) - print( - f"Targeting Medicaid enrollment for {row['state']} " - f"with target {row['enrollment']:.0f}k" - ) - # State 10-year age targets age_targets = pd.read_csv(STORAGE_FOLDER / "age_state.csv") diff --git a/policyengine_us_data/utils/minimize.py b/policyengine_us_data/utils/minimize.py new file mode 100644 index 00000000..ce2c6fdf --- /dev/null +++ b/policyengine_us_data/utils/minimize.py @@ -0,0 +1,444 @@ +from policyengine_us_data.utils.loss import build_loss_matrix +from policyengine_core.data import Dataset +from policyengine_us import Microsimulation +import numpy as np +import pandas as pd +import h5py +from policyengine_us_data.storage import STORAGE_FOLDER +from typing import Optional, Callable + +bad_targets = [ + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household", + "nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household", + "nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse", + "nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse", +] + + +def create_calibration_log_file(file_path, epoch=0): + dataset = Dataset.from_file(file_path) + sim = Microsimulation(dataset=dataset) + + loss_matrix, targets = build_loss_matrix(dataset, 2024) + + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_clean = targets[keep_idx] + + assert loss_matrix_clean.shape[1] == targets_clean.size + + estimates = ( + sim.calculate("household_weight", 2024).values @ loss_matrix_clean + ) + target_names = loss_matrix_clean.columns + + # Calculate and print some key metrics + errors = estimates - targets_clean + rel_errors = errors / targets_clean + + df = pd.DataFrame( + { + "target_name": target_names, + "estimate": estimates, + "target": targets_clean, + } + ) + df["epoch"] = epoch + df["error"] = df["estimate"] - df["target"] + df["rel_error"] = df["error"] / df["target"] + df["abs_error"] = df["error"].abs() + df["rel_abs_error"] = ( + df["abs_error"] / df["target"].abs() + if df["target"].abs().sum() > 0 + else np.nan + ) + df["loss"] = (df["rel_error"] ** 2).mean() + + df.to_csv( + str(file_path).replace(".h5", "_calibration_log.csv"), index=False + ) + + +def losses_for_candidates( + base_weights: np.ndarray, + idxs: np.ndarray, + est_mat: np.ndarray, + targets: np.ndarray, + norm: np.ndarray, + chunk_size: Optional[int] = 25_000, +) -> np.ndarray: + """ + Return the loss value *for each* candidate deletion in `idxs` + in one matrix multiplication. + + Parameters + ---------- + base_weights : (n,) original weight vector + idxs : (k,) candidate row indices to zero-out + est_mat : (n, m) estimate matrix + targets : (m,) calibration targets + norm : (m,) normalisation factors + chunk_size : max number of candidates to process at once + + Returns + ------- + losses : (k,) loss if row i were removed (and weights rescaled) + """ + W = base_weights + total = W.sum() + k = len(idxs) + losses = np.empty(k, dtype=float) + + # Work through the candidate list in blocks + for start in range(0, k, chunk_size): + stop = min(start + chunk_size, k) + part = idxs[start:stop] # (p,) where p ≤ chunk_size + p = len(part) + + # Build the delta matrix only for this chunk + delta = np.zeros((p, len(W))) + delta[np.arange(p), part] = -W[part] + + keep_total = total + delta.sum(axis=1) # (p,) + delta *= (total / keep_total)[:, None] + + # Matrix–matrix multiply → one matrix multiplication per chunk + ests = (W + delta) @ est_mat # (p, m) + rel_err = ((ests - targets) + 1) / (targets + 1) + losses[start:stop] = ((rel_err * norm) ** 2).mean(axis=1) + + return losses + + +def get_loss_from_mask( + weights, inclusion_mask, estimate_matrix, targets, normalisation_factor +): + """ + Calculate the loss based on the inclusion mask and the estimate matrix. + """ + # Step 1: Apply mask and rescale weights + masked_weights = weights.copy() + original_weight_total = masked_weights.sum() + if (~inclusion_mask).sum() > 0: + masked_weights[~inclusion_mask] = 0 + masked_weight_total = masked_weights.sum() + masked_weights[inclusion_mask] *= ( + original_weight_total / masked_weight_total + ) + + # Step 2: Re-calibrate the masked weights to hit targets + # Only calibrate the included households + included_weights = masked_weights[inclusion_mask] + included_estimate_matrix = estimate_matrix.iloc[ + inclusion_mask + ] # Keep as DataFrame + + # Call reweight function to calibrate the selected households + from policyengine_us_data.datasets.cps.enhanced_cps import reweight + + calibrated_weights_included = reweight( + included_weights, + included_estimate_matrix, + targets, + epochs=250, + ) + + # Put calibrated weights back into full array + calibrated_weights = np.zeros_like(masked_weights) + calibrated_weights[inclusion_mask] = calibrated_weights_included + + # Calculate estimates and loss from calibrated weights + estimates = calibrated_weights @ estimate_matrix + rel_error = ((estimates - targets) + 1) / (targets + 1) + loss = ((rel_error * normalisation_factor) ** 2).mean() + + return loss + + +def candidate_loss_contribution( + weights: np.ndarray, + estimate_matrix: np.ndarray, + targets: np.ndarray, + normalisation_factor: np.ndarray, + loss_rel_change_max: float, + count_iterations: int = 5, + view_fraction_per_iteration: float = 0.5, + fraction_remove_per_iteration: float = 0.05, +) -> np.ndarray: + """ + Minimization approach based on candidate loss contribution. + + This function iteratively removes households that contribute least to the loss, + maintaining the calibration quality within the specified tolerance. + + Parameters + ---------- + weights : (n,) household weights + estimate_matrix : (n, m) matrix mapping weights to estimates + targets : (m,) calibration targets + normalisation_factor : (m,) normalisation factors for different targets + loss_rel_change_max : maximum allowed relative change in loss + count_iterations : number of iterations to perform + view_fraction_per_iteration : fraction of households to evaluate each iteration + fraction_remove_per_iteration : fraction of households to remove each iteration + + Returns + ------- + inclusion_mask : (n,) boolean mask of households to keep + """ + from tqdm import tqdm + + full_mask = np.ones_like(weights, dtype=bool) + + for i in range(count_iterations): + inclusion_mask = full_mask.copy() + baseline_loss = get_loss_from_mask( + weights, + inclusion_mask, + estimate_matrix, + targets, + normalisation_factor, + ) + + # Sample only households that are currently included + indices = np.random.choice( + np.where(full_mask)[0], + size=int(full_mask.sum() * view_fraction_per_iteration), + replace=False, + ) + # 2. compute losses for the batch in one shot + candidate_losses = losses_for_candidates( + weights, indices, estimate_matrix, targets, normalisation_factor + ) + # 3. convert to relative change vs. baseline + household_loss_rel_changes = ( + candidate_losses - baseline_loss + ) / baseline_loss + + inclusion_mask = full_mask.copy() + household_loss_rel_changes = np.array(household_loss_rel_changes) + # Sort by the relative change in loss + sorted_indices = np.argsort(household_loss_rel_changes) + + # Remove the worst households + num_to_remove = int(len(weights) * fraction_remove_per_iteration) + worst_indices = indices[sorted_indices[:num_to_remove]] + inclusion_mask[worst_indices] = False + + # Calculate the new loss + new_loss = get_loss_from_mask( + weights, + inclusion_mask, + estimate_matrix, + targets, + normalisation_factor, + ) + rel_change = (new_loss - baseline_loss) / baseline_loss + + if rel_change > loss_rel_change_max: + print( + f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}, " + f"which is too high ({rel_change:.2%}). Stopping." + ) + break + + print( + f"Iteration {i + 1}: Loss changed from {baseline_loss} to {new_loss}" + ) + print( + f"Removed {num_to_remove} households with worst relative loss changes." + ) + + # Update the full mask + full_mask &= inclusion_mask + + return full_mask + + +def random_sampling_minimization( + weights, + estimate_matrix, + targets, + normalisation_factor, + random=True, + target_fractions=[0.5, 0.6, 0.7, 0.8, 0.9], +): + """A simple random sampling approach""" + n = len(weights) + + household_weights_normalized = weights / weights.sum() + + final_mask = None + lowest_loss = float("inf") + for fraction in target_fractions: + target_size = int(n * fraction) + # Random sampling with multiple attempts + best_mask = None + best_loss = float("inf") + + for _ in range(3): # Try 3 random samples + mask = np.zeros(n, dtype=bool) + mask[ + np.random.choice( + n, + target_size, + p=household_weights_normalized if random else None, + replace=False, + ) + ] = True + + loss = get_loss_from_mask( + weights, mask, estimate_matrix, targets, normalisation_factor + ) + + if loss < best_loss: + best_loss = loss + best_mask = mask + + if lowest_loss > best_loss: + lowest_loss = best_loss + final_mask = best_mask + + return final_mask + + +def minimize_dataset( + dataset, + output_path: str, + minimization_function: Callable = candidate_loss_contribution, + loss_matrix: Optional[pd.DataFrame] = None, + targets: Optional[np.ndarray] = None, + **kwargs, +) -> None: + """ + Main function to minimize a dataset using a specified minimization approach. + + Parameters + ---------- + dataset : path to the dataset file or Dataset object + output_path : path where the minimized dataset will be saved + loss_rel_change_max : maximum allowed relative change in loss + minimization_function : function that implements the minimization logic + **kwargs : additional arguments to pass to the minimization function + """ + # Handle both dataset class and file path + if hasattr(dataset, "file_path"): + dataset_path = str(dataset.file_path) + else: + dataset_path = str(dataset) + + create_calibration_log_file(dataset_path) + + dataset = Dataset.from_file(dataset_path) + if loss_matrix is None or targets is None: + loss_matrix, targets = build_loss_matrix(dataset, 2024) + + bad_mask = loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + loss_matrix_clean = loss_matrix.iloc[:, keep_idx] + targets_clean = targets[keep_idx] + assert loss_matrix_clean.shape[1] == targets_clean.size + else: + loss_matrix_clean = loss_matrix + targets_clean = targets + + sim = Microsimulation(dataset=dataset) + + weights = sim.calculate("household_weight", 2024).values + is_national = loss_matrix_clean.columns.str.startswith("nation/") + nation_normalisation_factor = is_national * (1 / is_national.sum()) + state_normalisation_factor = ~is_national * (1 / (~is_national).sum()) + normalisation_factor = np.where( + is_national, nation_normalisation_factor, state_normalisation_factor + ) + + # Call the minimization function + inclusion_mask = minimization_function( + weights=weights, + estimate_matrix=loss_matrix_clean, + targets=targets_clean, + normalisation_factor=normalisation_factor, + **kwargs, # Allows for passing either loss_rel_change_max OR target_fractions, depending on normalisation_factor choice. + ) + + # Extract household IDs for remaining households + household_ids = sim.calculate("household_id", 2024).values + remaining_households = household_ids[inclusion_mask] + + # Create a smaller dataset with only the remaining households + df = sim.to_input_dataframe() + smaller_df = df[df["household_id__2024"].isin(remaining_households)] + + weight_rel_change = ( + smaller_df["household_weight__2024"].sum() + / df["household_weight__2024"].sum() + ) + print(f"Weight relative change: {weight_rel_change:.2%}") + + # Create new simulation with smaller dataset + sim = Microsimulation(dataset=smaller_df) + + # Rescale weights to maintain total + initial_weights = ( + sim.calculate("household_weight", 2024).values / weight_rel_change + ) + + # Re-calibrate the final selected households to hit targets + print("Re-calibrating final selected households...") + + # Build loss matrix for the smaller dataset + smaller_loss_matrix, smaller_targets = build_loss_matrix(sim.dataset, 2024) + + # Apply same filtering as before + bad_mask = smaller_loss_matrix.columns.isin(bad_targets) + keep_mask_bool = ~bad_mask + keep_idx = np.where(keep_mask_bool)[0] + smaller_loss_matrix_clean = smaller_loss_matrix.iloc[:, keep_idx] + smaller_targets_clean = smaller_targets[keep_idx] + + from policyengine_us_data.datasets.cps.enhanced_cps import reweight + + calibrated_weights = reweight( + initial_weights, + smaller_loss_matrix_clean, # Now matches the smaller dataset size + smaller_targets_clean, + epochs=250, # Reduced epochs for faster processing + ) + sim.set_input("household_weight", 2024, calibrated_weights) + print("Final calibration completed successfully") + # Prepare data for saving + data = {} + for variable in sim.input_variables: + data[variable] = {2024: sim.calculate(variable, 2024).values} + if data[variable][2024].dtype == "object": + data[variable][2024] = data[variable][2024].astype("S") + + # Save to HDF5 file + with h5py.File(output_path, "w") as f: + for variable, values in data.items(): + for year, value in values.items(): + f.create_dataset(f"{variable}/{year}", data=value) + + print(f"Saved minimised dataset to {output_path}") + create_calibration_log_file(output_path, epoch=250) + + +if __name__ == "__main__": + # Example usage + files = [ + STORAGE_FOLDER / "enhanced_cps_2024.h5", + ] + + for file in files: + output_path = file.with_name(file.stem + "_minimised.h5") + minimize_dataset( + file, + output_path, + ) diff --git a/test_minimization_approach.ipynb b/test_minimization_approach.ipynb new file mode 100644 index 00000000..a4bd87be --- /dev/null +++ b/test_minimization_approach.ipynb @@ -0,0 +1,1634 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "037988b0", + "metadata": {}, + "source": [ + "# Testing Arena for Different Regularization Strategies" + ] + }, + { + "cell_type": "markdown", + "id": "268ab898", + "metadata": {}, + "source": [ + "#### Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "d6dc9cca", + "metadata": {}, + "outputs": [], + "source": [ + "from policyengine_us_data.utils.minimize import minimize_dataset, random_sampling_minimization, candidate_loss_contribution\n", + "from policyengine_us_data.storage import STORAGE_FOLDER\n", + "from policyengine_us import Microsimulation\n", + "from policyengine_us_data.datasets.cps.enhanced_cps import reweight, ExtendedCPS_2024\n", + "from policyengine_us_data.utils import build_loss_matrix\n", + "import numpy as np\n", + "import os\n", + "import h5py\n", + "import pandas as pd\n", + "import plotly.express as px\n", + "\n", + "\n", + "bad_targets = [\n", + " \"nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household\",\n", + " \"nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household\",\n", + " \"nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse\",\n", + " \"nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse\",\n", + " \"nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household\",\n", + " \"nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household\",\n", + " \"nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse\",\n", + " \"nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "683fd57e", + "metadata": {}, + "outputs": [], + "source": [ + "# Length of household entity in the dataset measured through household_weight:\n", + "\n", + "# Original ECPS 2024 dataset size: 41310\n", + "# Through \"random_sampling_minimization\" with 0.5 of the dataset being pruned: 20655\n", + "# Through \"random_sampling_minimization\" with 0.2 of the dataset being pruned: 33408\n", + "# After minimization through \"candidate_loss_contribution\" and a 1.0 max error change: 20655 \n", + "# After minimization through \"candidate_loss_contribution\" and a 0.001 max error change: 24786" + ] + }, + { + "cell_type": "markdown", + "id": "e99994d3", + "metadata": {}, + "source": [ + "## Enhanced_CPS_2024.py Regularization Approaches" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db975ac1", + "metadata": {}, + "outputs": [], + "source": [ + "## ALL TESTS\n", + "\n", + "## For L1 and L0 penalty approaches which are integrated into the enhanced CPS dataset creation\n", + "input_dataset = ExtendedCPS_2024\n", + "\n", + "approaches = [\"l0_sigmoid\"]#, \"l0_log\", \"l0_exp\", \"l1\"]\n", + "penalty_weights = [1e-5]#, 1e-4, 1e-3, 1e-2, 1e-1]\n", + "\n", + "def get_output_path(approach, file_name):\n", + " output_path = STORAGE_FOLDER / approach / file_name\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + " return output_path\n", + "\n", + "results = []\n", + "\n", + "for approach in approaches:\n", + " for penalty_weight in penalty_weights:\n", + " # Storing files in correct locations\n", + " cal_log_name = f\"calibration_log_{approach}_{penalty_weight}.csv\"\n", + " h5_name = f\"enhanced_cps_2024_{approach}_{penalty_weight}_minimised.h5\"\n", + " cal_log_path = get_output_path(approach, cal_log_name)\n", + " h5_path = get_output_path(approach, h5_name)\n", + "\n", + " sim = Microsimulation(dataset=input_dataset)\n", + " data = sim.dataset.load_dataset()\n", + " data[\"household_weight\"] = {}\n", + " original_weights = sim.calculate(\"household_weight\")\n", + " original_weights = original_weights.values + np.random.normal(\n", + " 1, 0.1, len(original_weights)\n", + " )\n", + " for year in range(2024, 2025):\n", + " loss_matrix, targets_array = build_loss_matrix(\n", + " input_dataset, year\n", + " )\n", + "\n", + " bad_mask = loss_matrix.columns.isin(bad_targets)\n", + " keep_mask_bool = ~bad_mask\n", + " keep_idx = np.where(keep_mask_bool)[0]\n", + " loss_matrix_clean = loss_matrix.iloc[:, keep_idx]\n", + " targets_array_clean = targets_array[keep_idx]\n", + " assert loss_matrix_clean.shape[1] == targets_array_clean.size\n", + "\n", + " optimised_weights = reweight(\n", + " original_weights,\n", + " loss_matrix_clean,\n", + " targets_array_clean,\n", + " log_path=cal_log_path, \n", + " penalty_approach=approach,\n", + " penalty_weight=penalty_weight, \n", + " epochs=250, # Reduced epochs for faster processing\n", + " )\n", + " keep_indices = prune_dataset(optimised_weights, epsilon=1e-3)\n", + " pruned_weights = optimised_weights[keep_indices]\n", + " \n", + " data[\"household_weight\"][year] = pruned_weights\n", + "\n", + " # Save to HDF5 file\n", + " with h5py.File(h5_path, \"w\") as f:\n", + " for variable, values in data.items():\n", + " for year, value in values.items():\n", + " f.create_dataset(f\"{variable}/{year}\", data=value)" + ] + }, + { + "cell_type": "markdown", + "id": "69ff392d", + "metadata": {}, + "source": [ + "## Minimize.py Regularization Approaches" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "aeab67b3", + "metadata": {}, + "outputs": [ + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[17], line 31\u001b[0m\n\u001b[1;32m 29\u001b[0m output_path \u001b[38;5;241m=\u001b[39m STORAGE_FOLDER \u001b[38;5;241m/\u001b[39m approach \u001b[38;5;241m/\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvalue\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_enhanced_cps_2024_minimised.h5\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 30\u001b[0m output_path\u001b[38;5;241m.\u001b[39mparent\u001b[38;5;241m.\u001b[39mmkdir(parents\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m---> 31\u001b[0m \u001b[43mminimize_dataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 32\u001b[0m \u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 33\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 34\u001b[0m \u001b[43m \u001b[49m\u001b[43mminimization_function\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mminimization_function\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 35\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget_fractions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 36\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m params \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mloss_rel_change_max\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 38\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m files:\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/utils/minimize.py:336\u001b[0m, in \u001b[0;36mminimize_dataset\u001b[0;34m(dataset, output_path, minimization_function, loss_matrix, targets, **kwargs)\u001b[0m\n\u001b[1;32m 333\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 334\u001b[0m dataset_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(dataset)\n\u001b[0;32m--> 336\u001b[0m \u001b[43mcreate_calibration_log_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 338\u001b[0m dataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_file(dataset_path)\n\u001b[1;32m 339\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m loss_matrix \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m targets \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/utils/minimize.py:26\u001b[0m, in \u001b[0;36mcreate_calibration_log_file\u001b[0;34m(file_path, epoch)\u001b[0m\n\u001b[1;32m 23\u001b[0m dataset \u001b[38;5;241m=\u001b[39m Dataset\u001b[38;5;241m.\u001b[39mfrom_file(file_path)\n\u001b[1;32m 24\u001b[0m sim \u001b[38;5;241m=\u001b[39m Microsimulation(dataset\u001b[38;5;241m=\u001b[39mdataset)\n\u001b[0;32m---> 26\u001b[0m loss_matrix, targets \u001b[38;5;241m=\u001b[39m \u001b[43mbuild_loss_matrix\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m2024\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 28\u001b[0m bad_mask \u001b[38;5;241m=\u001b[39m loss_matrix\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39misin(bad_targets)\n\u001b[1;32m 29\u001b[0m keep_mask_bool \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m~\u001b[39mbad_mask\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/policyengine_us_data/utils/loss.py:243\u001b[0m, in \u001b[0;36mbuild_loss_matrix\u001b[0;34m(dataset, time_period)\u001b[0m\n\u001b[1;32m 241\u001b[0m \u001b[38;5;66;03m# National ACA Spending\u001b[39;00m\n\u001b[1;32m 242\u001b[0m label \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnation/gov/aca_spending\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m--> 243\u001b[0m loss_matrix[label] \u001b[38;5;241m=\u001b[39m \u001b[43msim\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 244\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43maca_ptc\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mhousehold\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m2025\u001b[39;49m\n\u001b[1;32m 245\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mvalues\n\u001b[1;32m 246\u001b[0m ACA_SPENDING_2024 \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m9.8e10\u001b[39m \u001b[38;5;66;03m# 2024 outlays on PTC\u001b[39;00m\n\u001b[1;32m 247\u001b[0m targets_array\u001b[38;5;241m.\u001b[39mappend(ACA_SPENDING_2024)\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/microsimulation.py:54\u001b[0m, in \u001b[0;36mMicrosimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, use_weights, decode_enums)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m period \u001b[38;5;241m=\u001b[39m get_period(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period)\n\u001b[0;32m---> 54\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_enums\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m use_weights:\n\u001b[1;32m 56\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m values\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:485\u001b[0m, in \u001b[0;36mSimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, decode_enums)\u001b[0m\n\u001b[1;32m 482\u001b[0m np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mseed(\u001b[38;5;28mhash\u001b[39m(variable_name \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(period)) \u001b[38;5;241m%\u001b[39m \u001b[38;5;241m1000000\u001b[39m)\n\u001b[1;32m 484\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 485\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_calculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 486\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, EnumArray) \u001b[38;5;129;01mand\u001b[39;00m decode_enums:\n\u001b[1;32m 487\u001b[0m result \u001b[38;5;241m=\u001b[39m result\u001b[38;5;241m.\u001b[39mdecode_to_str()\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:715\u001b[0m, in \u001b[0;36mSimulation._calculate\u001b[0;34m(self, variable_name, period)\u001b[0m\n\u001b[1;32m 713\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 714\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_for_cycle(variable\u001b[38;5;241m.\u001b[39mname, period)\n\u001b[0;32m--> 715\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_formula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 717\u001b[0m \u001b[38;5;66;03m# If no result, use the default value and cache it\u001b[39;00m\n\u001b[1;32m 718\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m array \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 719\u001b[0m \u001b[38;5;66;03m# Check if the variable has a previously defined value\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:1005\u001b[0m, in \u001b[0;36mSimulation._run_formula\u001b[0;34m(self, variable, population, period)\u001b[0m\n\u001b[1;32m 1003\u001b[0m array \u001b[38;5;241m=\u001b[39m formula(population, period)\n\u001b[1;32m 1004\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1005\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[43mformula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparameters_at\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1007\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m array\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/variables/gov/aca/ptc/aca_ptc.py:14\u001b[0m, in \u001b[0;36maca_ptc.formula\u001b[0;34m(tax_unit, period, parameters)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mformula\u001b[39m(tax_unit, period, parameters):\n\u001b[0;32m---> 14\u001b[0m plan_cost \u001b[38;5;241m=\u001b[39m \u001b[43mtax_unit\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mslcsp\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 15\u001b[0m income \u001b[38;5;241m=\u001b[39m tax_unit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maca_magi\u001b[39m\u001b[38;5;124m\"\u001b[39m, period)\n\u001b[1;32m 16\u001b[0m applicable_figure \u001b[38;5;241m=\u001b[39m tax_unit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maca_ptc_phase_out_rate\u001b[39m\u001b[38;5;124m\"\u001b[39m, period)\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/populations/group_population.py:38\u001b[0m, in \u001b[0;36mGroupPopulation.__call__\u001b[0;34m(self, variable_name, period, options)\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msum(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmembers(variable_name, period, options))\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 38\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__call__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/populations/population.py:137\u001b[0m, in \u001b[0;36mPopulation.__call__\u001b[0;34m(self, variable_name, period, options)\u001b[0m\n\u001b[1;32m 133\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msimulation\u001b[38;5;241m.\u001b[39mcalculate_divide(\n\u001b[1;32m 134\u001b[0m variable_name, period, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mcalculate_kwargs\n\u001b[1;32m 135\u001b[0m )\n\u001b[1;32m 136\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 137\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msimulation\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 138\u001b[0m \u001b[43m \u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcalculate_kwargs\u001b[49m\n\u001b[1;32m 139\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/microsimulation.py:54\u001b[0m, in \u001b[0;36mMicrosimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, use_weights, decode_enums)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m period \u001b[38;5;241m=\u001b[39m get_period(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period)\n\u001b[0;32m---> 54\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_enums\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m use_weights:\n\u001b[1;32m 56\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m values\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:485\u001b[0m, in \u001b[0;36mSimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, decode_enums)\u001b[0m\n\u001b[1;32m 482\u001b[0m np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mseed(\u001b[38;5;28mhash\u001b[39m(variable_name \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(period)) \u001b[38;5;241m%\u001b[39m \u001b[38;5;241m1000000\u001b[39m)\n\u001b[1;32m 484\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 485\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_calculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 486\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, EnumArray) \u001b[38;5;129;01mand\u001b[39;00m decode_enums:\n\u001b[1;32m 487\u001b[0m result \u001b[38;5;241m=\u001b[39m result\u001b[38;5;241m.\u001b[39mdecode_to_str()\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:681\u001b[0m, in \u001b[0;36mSimulation._calculate\u001b[0;34m(self, variable_name, period)\u001b[0m\n\u001b[1;32m 679\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_calculate(variable_name, contained_months[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m])\n\u001b[1;32m 680\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 681\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate_add\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 682\u001b[0m alternate_period_handling \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 683\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m variable\u001b[38;5;241m.\u001b[39mdefinition_period \u001b[38;5;241m==\u001b[39m YEAR \u001b[38;5;129;01mand\u001b[39;00m period\u001b[38;5;241m.\u001b[39munit \u001b[38;5;241m==\u001b[39m MONTH:\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/microsimulation.py:67\u001b[0m, in \u001b[0;36mMicrosimulation.calculate_add\u001b[0;34m(self, variable_name, period, map_to, use_weights)\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mcalculate_add\u001b[39m(\n\u001b[1;32m 61\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 62\u001b[0m variable_name: \u001b[38;5;28mstr\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 65\u001b[0m use_weights: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 66\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m MicroSeries:\n\u001b[0;32m---> 67\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate_add\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 68\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m use_weights:\n\u001b[1;32m 69\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m values\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:846\u001b[0m, in \u001b[0;36mSimulation.calculate_add\u001b[0;34m(self, variable_name, period, decode_enums)\u001b[0m\n\u001b[1;32m 835\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m variable\u001b[38;5;241m.\u001b[39mdefinition_period \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m [\n\u001b[1;32m 836\u001b[0m periods\u001b[38;5;241m.\u001b[39mDAY,\n\u001b[1;32m 837\u001b[0m periods\u001b[38;5;241m.\u001b[39mMONTH,\n\u001b[1;32m 838\u001b[0m periods\u001b[38;5;241m.\u001b[39mYEAR,\n\u001b[1;32m 839\u001b[0m ]:\n\u001b[1;32m 840\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 841\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnable to sum constant variable \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m over period \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m: only variables defined daily, monthly, or yearly can be summed over time.\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(\n\u001b[1;32m 842\u001b[0m variable\u001b[38;5;241m.\u001b[39mname, period\n\u001b[1;32m 843\u001b[0m )\n\u001b[1;32m 844\u001b[0m )\n\u001b[0;32m--> 846\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28msum\u001b[39m(\n\u001b[1;32m 847\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcalculate(variable_name, sub_period)\n\u001b[1;32m 848\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m sub_period \u001b[38;5;129;01min\u001b[39;00m period\u001b[38;5;241m.\u001b[39mget_subperiods(variable\u001b[38;5;241m.\u001b[39mdefinition_period)\n\u001b[1;32m 849\u001b[0m )\n\u001b[1;32m 850\u001b[0m holder \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_holder(variable\u001b[38;5;241m.\u001b[39mname)\n\u001b[1;32m 851\u001b[0m holder\u001b[38;5;241m.\u001b[39mput_in_cache(result, period, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbranch_name)\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:847\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 835\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m variable\u001b[38;5;241m.\u001b[39mdefinition_period \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m [\n\u001b[1;32m 836\u001b[0m periods\u001b[38;5;241m.\u001b[39mDAY,\n\u001b[1;32m 837\u001b[0m periods\u001b[38;5;241m.\u001b[39mMONTH,\n\u001b[1;32m 838\u001b[0m periods\u001b[38;5;241m.\u001b[39mYEAR,\n\u001b[1;32m 839\u001b[0m ]:\n\u001b[1;32m 840\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 841\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnable to sum constant variable \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m over period \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m: only variables defined daily, monthly, or yearly can be summed over time.\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(\n\u001b[1;32m 842\u001b[0m variable\u001b[38;5;241m.\u001b[39mname, period\n\u001b[1;32m 843\u001b[0m )\n\u001b[1;32m 844\u001b[0m )\n\u001b[1;32m 846\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28msum\u001b[39m(\n\u001b[0;32m--> 847\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msub_period\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 848\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m sub_period \u001b[38;5;129;01min\u001b[39;00m period\u001b[38;5;241m.\u001b[39mget_subperiods(variable\u001b[38;5;241m.\u001b[39mdefinition_period)\n\u001b[1;32m 849\u001b[0m )\n\u001b[1;32m 850\u001b[0m holder \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_holder(variable\u001b[38;5;241m.\u001b[39mname)\n\u001b[1;32m 851\u001b[0m holder\u001b[38;5;241m.\u001b[39mput_in_cache(result, period, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbranch_name)\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/microsimulation.py:54\u001b[0m, in \u001b[0;36mMicrosimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, use_weights, decode_enums)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m period \u001b[38;5;241m=\u001b[39m get_period(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period)\n\u001b[0;32m---> 54\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_enums\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m use_weights:\n\u001b[1;32m 56\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m values\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:485\u001b[0m, in \u001b[0;36mSimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, decode_enums)\u001b[0m\n\u001b[1;32m 482\u001b[0m np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mseed(\u001b[38;5;28mhash\u001b[39m(variable_name \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(period)) \u001b[38;5;241m%\u001b[39m \u001b[38;5;241m1000000\u001b[39m)\n\u001b[1;32m 484\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 485\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_calculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 486\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, EnumArray) \u001b[38;5;129;01mand\u001b[39;00m decode_enums:\n\u001b[1;32m 487\u001b[0m result \u001b[38;5;241m=\u001b[39m result\u001b[38;5;241m.\u001b[39mdecode_to_str()\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:715\u001b[0m, in \u001b[0;36mSimulation._calculate\u001b[0;34m(self, variable_name, period)\u001b[0m\n\u001b[1;32m 713\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 714\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_for_cycle(variable\u001b[38;5;241m.\u001b[39mname, period)\n\u001b[0;32m--> 715\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_formula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 717\u001b[0m \u001b[38;5;66;03m# If no result, use the default value and cache it\u001b[39;00m\n\u001b[1;32m 718\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m array \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 719\u001b[0m \u001b[38;5;66;03m# Check if the variable has a previously defined value\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:940\u001b[0m, in \u001b[0;36mSimulation._run_formula\u001b[0;34m(self, variable, population, period)\u001b[0m\n\u001b[1;32m 938\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m added_variable \u001b[38;5;129;01min\u001b[39;00m adds_list:\n\u001b[1;32m 939\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m added_variable \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtax_benefit_system\u001b[38;5;241m.\u001b[39mvariables:\n\u001b[0;32m--> 940\u001b[0m values \u001b[38;5;241m=\u001b[39m values \u001b[38;5;241m+\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 941\u001b[0m \u001b[43m \u001b[49m\u001b[43madded_variable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvariable\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mentity\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkey\u001b[49m\n\u001b[1;32m 942\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 943\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 944\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/microsimulation.py:54\u001b[0m, in \u001b[0;36mMicrosimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, use_weights, decode_enums)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m period \u001b[38;5;241m=\u001b[39m get_period(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_calculation_period)\n\u001b[0;32m---> 54\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcalculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmap_to\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_enums\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m use_weights:\n\u001b[1;32m 56\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m values\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:485\u001b[0m, in \u001b[0;36mSimulation.calculate\u001b[0;34m(self, variable_name, period, map_to, decode_enums)\u001b[0m\n\u001b[1;32m 482\u001b[0m np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mseed(\u001b[38;5;28mhash\u001b[39m(variable_name \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(period)) \u001b[38;5;241m%\u001b[39m \u001b[38;5;241m1000000\u001b[39m)\n\u001b[1;32m 484\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 485\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_calculate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 486\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, EnumArray) \u001b[38;5;129;01mand\u001b[39;00m decode_enums:\n\u001b[1;32m 487\u001b[0m result \u001b[38;5;241m=\u001b[39m result\u001b[38;5;241m.\u001b[39mdecode_to_str()\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:715\u001b[0m, in \u001b[0;36mSimulation._calculate\u001b[0;34m(self, variable_name, period)\u001b[0m\n\u001b[1;32m 713\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 714\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_for_cycle(variable\u001b[38;5;241m.\u001b[39mname, period)\n\u001b[0;32m--> 715\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_formula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvariable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 717\u001b[0m \u001b[38;5;66;03m# If no result, use the default value and cache it\u001b[39;00m\n\u001b[1;32m 718\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m array \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 719\u001b[0m \u001b[38;5;66;03m# Check if the variable has a previously defined value\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_core/simulations/simulation.py:1005\u001b[0m, in \u001b[0;36mSimulation._run_formula\u001b[0;34m(self, variable, population, period)\u001b[0m\n\u001b[1;32m 1003\u001b[0m array \u001b[38;5;241m=\u001b[39m formula(population, period)\n\u001b[1;32m 1004\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1005\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[43mformula\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpopulation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparameters_at\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1007\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m array\n", + "File \u001b[0;32m~/Desktop/PolicyEngine/policyengine-us-data/.venv/lib/python3.11/site-packages/policyengine_us/variables/gov/aca/slspc/slcsp_age_curve_amount_person.py:27\u001b[0m, in \u001b[0;36mslcsp_age_curve_amount_person.formula\u001b[0;34m(person, period, parameters)\u001b[0m\n\u001b[1;32m 19\u001b[0m p \u001b[38;5;241m=\u001b[39m parameters(period)\u001b[38;5;241m.\u001b[39mgov\u001b[38;5;241m.\u001b[39maca\u001b[38;5;241m.\u001b[39mage_curves\n\u001b[1;32m 21\u001b[0m \u001b[38;5;66;03m# Handle other states with regular bracket structures\u001b[39;00m\n\u001b[1;32m 22\u001b[0m multiplier \u001b[38;5;241m=\u001b[39m select(\n\u001b[1;32m 23\u001b[0m [\n\u001b[1;32m 24\u001b[0m state_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAL\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 25\u001b[0m state_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDC\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 26\u001b[0m state_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMA\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m---> 27\u001b[0m \u001b[43mstate_code\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mMN\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m,\n\u001b[1;32m 28\u001b[0m state_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMS\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 29\u001b[0m state_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOR\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 30\u001b[0m state_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUT\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 31\u001b[0m ],\n\u001b[1;32m 32\u001b[0m [\n\u001b[1;32m 33\u001b[0m p\u001b[38;5;241m.\u001b[39mal\u001b[38;5;241m.\u001b[39mcalc(age),\n\u001b[1;32m 34\u001b[0m p\u001b[38;5;241m.\u001b[39mdc\u001b[38;5;241m.\u001b[39mcalc(age),\n\u001b[1;32m 35\u001b[0m p\u001b[38;5;241m.\u001b[39mma\u001b[38;5;241m.\u001b[39mcalc(age),\n\u001b[1;32m 36\u001b[0m p\u001b[38;5;241m.\u001b[39mmn\u001b[38;5;241m.\u001b[39mcalc(age),\n\u001b[1;32m 37\u001b[0m p\u001b[38;5;241m.\u001b[39mms\u001b[38;5;241m.\u001b[39mcalc(age),\n\u001b[1;32m 38\u001b[0m p[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mor\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mcalc(age),\n\u001b[1;32m 39\u001b[0m p\u001b[38;5;241m.\u001b[39mut\u001b[38;5;241m.\u001b[39mcalc(age),\n\u001b[1;32m 40\u001b[0m ],\n\u001b[1;32m 41\u001b[0m default\u001b[38;5;241m=\u001b[39mp\u001b[38;5;241m.\u001b[39mdefault\u001b[38;5;241m.\u001b[39mcalc(age),\n\u001b[1;32m 42\u001b[0m )\n\u001b[1;32m 43\u001b[0m age_curve_applies \u001b[38;5;241m=\u001b[39m person\u001b[38;5;241m.\u001b[39mtax_unit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mslcsp_age_curve_applies\u001b[39m\u001b[38;5;124m\"\u001b[39m, period)\n\u001b[1;32m 44\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m base_cost \u001b[38;5;241m*\u001b[39m multiplier \u001b[38;5;241m*\u001b[39m age_curve_applies\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "## For approaches external to the reweighting approach implemented in enhanced CPS dataset creation\n", + "\n", + "files = [\n", + " STORAGE_FOLDER / \"enhanced_cps_2024.h5\",\n", + " ]\n", + "\n", + "approaches = {\n", + " \"random_sampling_minimization\": random_sampling_minimization,\n", + " \"candidate_loss_contribution\": candidate_loss_contribution,\n", + "}\n", + "\n", + "optional_params = {\n", + " \"random_sampling_minimization\": {\n", + " \"target_fractions\": [0.7, 0.8, 0.9]#, 0.5, 0.6]], # fractions of the dataset to keep\n", + " },\n", + " \"candidate_loss_contribution\": {\n", + " \"loss_rel_change_max\": [0.00001, 0.000001, 0.0000001]#, 0.001, 0.0001]] # maximum relative change in loss\n", + " }\n", + "}\n", + "\n", + "for approach, function in approaches.items():\n", + " minimization_function = function\n", + " # other minimization function approach is \"random_sampling_minimization\", for which you can specify the tolerance for loss relative change.\n", + "\n", + " for params, values in optional_params[approach].items():\n", + " for value in values:\n", + " if params == \"target_fractions\":\n", + " for file in files:\n", + " output_path = STORAGE_FOLDER / approach / f\"{value}_enhanced_cps_2024_minimised.h5\"\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + " minimize_dataset(\n", + " file,\n", + " output_path,\n", + " minimization_function=minimization_function, \n", + " target_fractions=[value]\n", + " )\n", + " elif params == \"loss_rel_change_max\":\n", + " for file in files:\n", + " output_path = STORAGE_FOLDER / approach / f\"{value}_enhanced_cps_2024_minimised.h5\"\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + " minimize_dataset(\n", + " file,\n", + " output_path,\n", + " minimization_function=minimization_function, \n", + " loss_rel_change_max=value\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "8568b5ca", + "metadata": {}, + "source": [ + "## Visualization of Results\n", + "Calibration logs can also be shown in María's Vercel dashboard" + ] + }, + { + "cell_type": "markdown", + "id": "f8b0fe2e", + "metadata": {}, + "source": [ + "### Data Scrape for Plotting" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "225debd8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
strategyparameterdataset_sizetotal_loss
0nonenone413100.0069
\n", + "
" + ], + "text/plain": [ + " strategy parameter dataset_size total_loss\n", + "0 none none 41310 0.0069" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"\"\"\n", + "Creating scoring of loss\n", + "Creating dataframe to store regularization results\n", + "\"\"\"\n", + "\n", + "\n", + "def get_output_path(approach, file_name):\n", + " output_path = STORAGE_FOLDER / approach / file_name\n", + " output_path.parent.mkdir(parents=True, exist_ok=True)\n", + " return output_path\n", + "\n", + "# Calculate quality categories\n", + "def loss_score(calibration_log):\n", + " excellent_count = (\n", + " calibration_log[\"rel_abs_error\"] < 0.05).sum() # < 5% error\n", + " good_count = (\n", + " (calibration_log[\"rel_abs_error\"] >= 0.05)\n", + " & (calibration_log[\"rel_abs_error\"] < 0.20)).sum() # 5-20% error\n", + " total_targets = len(calibration_log)\n", + " # Calculate quality score\n", + " quality_score = (excellent_count * 100 + good_count * 75) / total_targets\n", + " return quality_score\n", + "\n", + "\n", + "\n", + "# Initial dataframe setup\n", + "reg_results_df = pd.DataFrame({\n", + " 'strategy': ['none'],\n", + " 'parameter': ['none'],\n", + " 'dataset_size': [41310],\n", + " 'total_loss': [6.9e-3]\n", + "})\n", + "\n", + "def add_result(df, strategy, parameter, dataset_size, total_loss):\n", + " new_rows = pd.DataFrame({\n", + " 'strategy': strategy, \n", + " 'parameter': parameter, \n", + " 'dataset_size': [dataset_size],\n", + " 'total_loss': [total_loss]\n", + " })\n", + " return pd.concat([reg_results_df, new_rows], ignore_index=True)\n", + "\n", + "# Example usage\n", + "#reg_results_df = add_result(reg_results_df, ['L1', 'L2'], ['0.001','0.002'], [35000, 4000], [7.2e-3, 7.2e-3])\n", + "reg_results_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7bb3ef3c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
strategyparameterdataset_sizetotal_loss
0nonenone413100.006900
1candidate_loss_contribution1.0413100.006900
2random_sampling_minimization0.52065580.882353
3random_sampling_minimization0.62478679.117647
4random_sampling_minimization1.0413100.006900
5candidate_loss_contribution0.0014131077.647059
6candidate_loss_contribution0.00014131080.196078
\n", + "
" + ], + "text/plain": [ + " strategy parameter dataset_size total_loss\n", + "0 none none 41310 0.006900\n", + "1 candidate_loss_contribution 1.0 41310 0.006900\n", + "2 random_sampling_minimization 0.5 20655 80.882353\n", + "3 random_sampling_minimization 0.6 24786 79.117647\n", + "4 random_sampling_minimization 1.0 41310 0.006900\n", + "5 candidate_loss_contribution 0.001 41310 77.647059\n", + "6 candidate_loss_contribution 0.0001 41310 80.196078" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"\"\"\n", + "Scraping values from created calibration_log.csv and .h5 files to populate the plotting dataframe\n", + "\"\"\"\n", + "\n", + "og_size = 41310 # Original size of the dataset\n", + "og_loss = 6.9e-3 # Original loss from the baseline dataset\n", + "\n", + "approaches = [\"l0_sigmoid\", \"l0_log\", \"l0_exp\", \"l1\"]\n", + "penalty_weights = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]\n", + "\n", + "for approach in approaches:\n", + " strategy = approach\n", + " reg_results_df = add_result(reg_results_df, strategy, 1.0, og_size, og_loss)\n", + " for penalty_weight in penalty_weights:\n", + " parameter = penalty_weight\n", + "\n", + " # Pull length of .h5 file\n", + " h5_name = f\"enhanced_cps_2024_{strategy}_{parameter}_minimised.h5\"\n", + " h5_path = get_output_path(strategy, h5_name)\n", + " dataset_size = len(h5py.File(h5_path, \"r\")['household_weight/2024'])\n", + "\n", + " # Pull score of loss column\n", + " cal_log_name = f\"calibration_log_{strategy}_{parameter}.csv\"\n", + " cal_log_path = get_output_path(strategy, cal_log_name)\n", + " calibration_log = pd.read_csv(cal_log_path)\n", + " loss_value = loss_score(calibration_log)\n", + " \n", + " reg_results_df = add_result(reg_results_df, strategy, parameter, dataset_size, loss_value)\n", + "\n", + "approaches = {\n", + " \"random_sampling_minimization\":[0.5, 0.6], #, 0.7, 0.8, 0.9], \n", + " \"candidate_loss_contribution\": [0.001, 0.0001] #, 0.00001, 0.000001, 0.0000001],\n", + "}\n", + "\n", + "for approach, fractions in approaches.items(): # Use .items() to get key-value pairs\n", + " reg_results_df = add_result(reg_results_df, strategy, 1.0, og_size, og_loss)\n", + " for fraction in fractions:\n", + " strategy = approach\n", + " parameter = fraction\n", + "\n", + " # Pull length of .h5 file\n", + " h5_name = f\"{fraction}_enhanced_cps_2024_minimised.h5\"\n", + " h5_path = STORAGE_FOLDER / strategy / h5_name\n", + " dataset_size = len(h5py.File(h5_path, \"r\")['household_weight/2024'])\n", + "\n", + " # Pull sum of loss column\n", + " cal_log_name = f\"{fraction}_enhanced_cps_2024_minimised_calibration_log.csv\"\n", + " cal_log_path = get_output_path(strategy, cal_log_name)\n", + " calibration_log = pd.read_csv(cal_log_path)\n", + " loss_value = loss_score(calibration_log)\n", + "\n", + " reg_results_df = add_result(reg_results_df, strategy, parameter, dataset_size, loss_value)\n", + "\n", + "reg_results_df\n" + ] + }, + { + "cell_type": "markdown", + "id": "5b203ccd", + "metadata": {}, + "source": [ + "### Plotting" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "2dc0891c", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "plotlyServerURL": "https://plot.ly" + }, + "data": [ + { + "customdata": [ + [ + "candidate_loss_contribution" + ], + [ + "candidate_loss_contribution" + ], + [ + "candidate_loss_contribution" + ] + ], + "hovertemplate": "Strategy: %{customdata[0]}
Size: %{x}
Loss: %{y:.4f}
Param: %{text}", + "legendgroup": "candidate_loss_contribution", + "line": { + "color": "#636efa", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines+markers+text", + "name": "candidate_loss_contribution", + "orientation": "v", + "showlegend": true, + "text": [ + "1.0", + "0.001", + "0.0001" + ], + "textposition": "top center", + "type": "scatter", + "x": [ + 41310, + 41310, + 41310 + ], + "xaxis": "x", + "y": [ + 0.0069, + 77.6470588235294, + 80.19607843137256 + ], + "yaxis": "y" + }, + { + "customdata": [ + [ + "random_sampling_minimization" + ], + [ + "random_sampling_minimization" + ], + [ + "random_sampling_minimization" + ] + ], + "hovertemplate": "Strategy: %{customdata[0]}
Size: %{x}
Loss: %{y:.4f}
Param: %{text}", + "legendgroup": "random_sampling_minimization", + "line": { + "color": "#EF553B", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines+markers+text", + "name": "random_sampling_minimization", + "orientation": "v", + "showlegend": true, + "text": [ + "1.0", + "0.6", + "0.5" + ], + "textposition": "top center", + "type": "scatter", + "x": [ + 41310, + 24786, + 20655 + ], + "xaxis": "x", + "y": [ + 0.0069, + 79.11764705882354, + 80.88235294117646 + ], + "yaxis": "y" + } + ], + "layout": { + "annotations": [ + { + "arrowhead": 1, + "ax": 40, + "ay": -40, + "font": { + "color": "gray" + }, + "showarrow": true, + "text": "Baseline", + "x": 41310, + "y": 0.0069 + } + ], + "height": 600, + "hovermode": "closest", + "legend": { + "title": { + "text": "Strategy" + }, + "tracegroupgap": 0 + }, + "shapes": [ + { + "line": { + "color": "gray", + "dash": "dash" + }, + "name": "Baseline Size", + "type": "line", + "x0": 41310, + "x1": 41310, + "y0": 0.0069, + "y1": 80.88235294117646 + }, + { + "line": { + "color": "gray", + "dash": "dash" + }, + "name": "Baseline Loss", + "type": "line", + "x0": 20655, + "x1": 41310, + "y0": 0.0069, + "y1": 0.0069 + } + ], + "template": { + "data": { + "bar": [ + { + "error_x": { + "color": "#2a3f5f" + }, + "error_y": { + "color": "#2a3f5f" + }, + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + }, + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "bar" + } + ], + "barpolar": [ + { + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + }, + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "barpolar" + } + ], + "carpet": [ + { + "aaxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "baxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "type": "carpet" + } + ], + "choropleth": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "choropleth" + } + ], + "contour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "contour" + } + ], + "contourcarpet": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "contourcarpet" + } + ], + "heatmap": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmap" + } + ], + "heatmapgl": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmapgl" + } + ], + "histogram": [ + { + "marker": { + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "histogram" + } + ], + "histogram2d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2d" + } + ], + "histogram2dcontour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2dcontour" + } + ], + "mesh3d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "mesh3d" + } + ], + "parcoords": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "parcoords" + } + ], + "pie": [ + { + "automargin": true, + "type": "pie" + } + ], + "scatter": [ + { + "fillpattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + }, + "type": "scatter" + } + ], + "scatter3d": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter3d" + } + ], + "scattercarpet": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattercarpet" + } + ], + "scattergeo": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergeo" + } + ], + "scattergl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergl" + } + ], + "scattermapbox": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattermapbox" + } + ], + "scatterpolar": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolar" + } + ], + "scatterpolargl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolargl" + } + ], + "scatterternary": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterternary" + } + ], + "surface": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "surface" + } + ], + "table": [ + { + "cells": { + "fill": { + "color": "#EBF0F8" + }, + "line": { + "color": "white" + } + }, + "header": { + "fill": { + "color": "#C8D4E3" + }, + "line": { + "color": "white" + } + }, + "type": "table" + } + ] + }, + "layout": { + "annotationdefaults": { + "arrowcolor": "#2a3f5f", + "arrowhead": 0, + "arrowwidth": 1 + }, + "autotypenumbers": "strict", + "coloraxis": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "colorscale": { + "diverging": [ + [ + 0, + "#8e0152" + ], + [ + 0.1, + "#c51b7d" + ], + [ + 0.2, + "#de77ae" + ], + [ + 0.3, + "#f1b6da" + ], + [ + 0.4, + "#fde0ef" + ], + [ + 0.5, + "#f7f7f7" + ], + [ + 0.6, + "#e6f5d0" + ], + [ + 0.7, + "#b8e186" + ], + [ + 0.8, + "#7fbc41" + ], + [ + 0.9, + "#4d9221" + ], + [ + 1, + "#276419" + ] + ], + "sequential": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "sequentialminus": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ] + }, + "colorway": [ + "#636efa", + "#EF553B", + "#00cc96", + "#ab63fa", + "#FFA15A", + "#19d3f3", + "#FF6692", + "#B6E880", + "#FF97FF", + "#FECB52" + ], + "font": { + "color": "#2a3f5f" + }, + "geo": { + "bgcolor": "white", + "lakecolor": "white", + "landcolor": "#E5ECF6", + "showlakes": true, + "showland": true, + "subunitcolor": "white" + }, + "hoverlabel": { + "align": "left" + }, + "hovermode": "closest", + "mapbox": { + "style": "light" + }, + "paper_bgcolor": "white", + "plot_bgcolor": "#E5ECF6", + "polar": { + "angularaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "radialaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "scene": { + "xaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "yaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "zaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + } + }, + "shapedefaults": { + "line": { + "color": "#2a3f5f" + } + }, + "ternary": { + "aaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "baxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "caxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "title": { + "x": 0.05 + }, + "xaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + }, + "yaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + } + } + }, + "title": { + "text": "ECPS Regularization Strategy Comparison" + }, + "width": 900, + "xaxis": { + "anchor": "y", + "domain": [ + 0, + 1 + ], + "title": { + "text": "Number of Households" + } + }, + "yaxis": { + "anchor": "x", + "domain": [ + 0, + 1 + ], + "title": { + "text": "Calibration Score" + } + } + } + } + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\"\"\"\n", + "Creating a multi-line plot with plotly\n", + "\"\"\"\n", + "\n", + "# Filter out the baseline row\n", + "df_plot = reg_results_df[reg_results_df['strategy'] != 'none'].copy()\n", + "df_plot['parameter'] = df_plot['parameter'].astype(str)\n", + "df_plot = df_plot.sort_values(by=['strategy', 'dataset_size'], ascending=[True, False])\n", + "\n", + "# Create line plot\n", + "fig = px.line(\n", + " df_plot,\n", + " x=\"dataset_size\",\n", + " y=\"total_loss\",\n", + " color=\"strategy\",\n", + " markers=True,\n", + " text=\"parameter\",\n", + " custom_data=[\"strategy\"],\n", + " title=\"ECPS Regularization Strategy Comparison\",\n", + " labels={\n", + " \"dataset_size\": \"Number of Households\",\n", + " \"total_loss\": \"Calibration Score\",\n", + " \"strategy\": \"Regularization Approach\"\n", + " }\n", + ")\n", + "\n", + "# Add text labels (parameter) on hover\n", + "fig.update_traces(\n", + " textposition=\"top center\", \n", + " hovertemplate=(\n", + " \"Strategy: %{customdata[0]}
\"\n", + " \"Size: %{x}
\"\n", + " \"Loss: %{y:.4f}
\"\n", + " \"Param: %{text}\"\n", + " )\n", + ")\n", + "\n", + "# Add baseline lines\n", + "baseline = reg_results_df[reg_results_df['strategy'] == 'none'].iloc[0]\n", + "\n", + "fig.add_shape(\n", + " type=\"line\",\n", + " x0=baseline[\"dataset_size\"], x1=baseline[\"dataset_size\"],\n", + " y0=df_plot[\"total_loss\"].min(), y1=df_plot[\"total_loss\"].max(),\n", + " line=dict(color=\"gray\", dash=\"dash\"),\n", + " name=\"Baseline Size\"\n", + ")\n", + "\n", + "fig.add_shape(\n", + " type=\"line\",\n", + " x0=df_plot[\"dataset_size\"].min(), x1=df_plot[\"dataset_size\"].max(),\n", + " y0=baseline[\"total_loss\"], y1=baseline[\"total_loss\"],\n", + " line=dict(color=\"gray\", dash=\"dash\"),\n", + " name=\"Baseline Loss\"\n", + ")\n", + "\n", + "# Add annotation for the baseline\n", + "fig.add_annotation(\n", + " x=baseline[\"dataset_size\"],\n", + " y=baseline[\"total_loss\"],\n", + " text=\"Baseline\",\n", + " showarrow=True,\n", + " arrowhead=1,\n", + " ax=40,\n", + " ay=-40,\n", + " font=dict(color=\"gray\"),\n", + ")\n", + "\n", + "# Final layout adjustments\n", + "fig.update_layout(\n", + " legend_title=\"Strategy\",\n", + " hovermode=\"closest\",\n", + " width=900,\n", + " height=600\n", + ")\n", + "\n", + "fig.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "policyengine-us-data", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}