diff --git a/changelog_entry.yaml b/changelog_entry.yaml
index e69de29b..73fe68eb 100644
--- a/changelog_entry.yaml
+++ b/changelog_entry.yaml
@@ -0,0 +1,5 @@
+- bump: patch
+  changes:
+    changed:
+      - renamed "ucgid" to "ucgid_str" and changed the constraint operation from "equals" to "in" in the age targets loading script
+      - removed the [0.5] quantile key access from imputation results to match microimpute's new output format
\ No newline at end of file
diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py
index 1431396d..57530c5d 100644
--- a/policyengine_us_data/datasets/cps/cps.py
+++ b/policyengine_us_data/datasets/cps/cps.py
@@ -186,14 +186,14 @@ def add_rent(self, cps: h5py.File, person: DataFrame, household: DataFrame):
     imputed_values = fitted_model.predict(X_test=inference_df)
     logging.info("Imputation complete.")
     cps["rent"] = np.zeros_like(cps["age"])
-    cps["rent"][mask] = imputed_values[0.5]["rent"]
+    cps["rent"][mask] = imputed_values["rent"]
     # Assume zero housing assistance since
     cps["pre_subsidy_rent"] = cps["rent"]
     cps["housing_assistance"] = np.zeros_like(
         cps["spm_unit_capped_housing_subsidy_reported"]
     )
     cps["real_estate_taxes"] = np.zeros_like(cps["age"])
-    cps["real_estate_taxes"][mask] = imputed_values[0.5]["real_estate_taxes"]
+    cps["real_estate_taxes"][mask] = imputed_values["real_estate_taxes"]
 
 
 def add_takeup(self):
@@ -1618,7 +1618,7 @@ def add_tips(self, cps: h5py.File):
     cps["tip_income"] = model.predict(
         X_test=cps,
         mean_quantile=0.5,
-    )[0.5].tip_income.values
+    ).tip_income.values
 
     self.save_dataset(cps)
 
@@ -1957,7 +1957,7 @@ def determine_reference_person(group):
     imputations = fitted_model.predict(X_test=receiver_data)
 
     for var in IMPUTED_VARIABLES:
-        cps[var] = imputations[0.5][var]
+        cps[var] = imputations[var]
 
     cps["net_worth"] = cps["networth"]
     del cps["networth"]
diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py
index 782280a3..f28c726c 100644
--- a/policyengine_us_data/datasets/cps/extended_cps.py
+++ b/policyengine_us_data/datasets/cps/extended_cps.py
@@ -299,7 +299,7 @@ def impute_income_variables(
 
         # Extract median predictions and add to result
         for var in batch_vars:
-            result[var] = batch_predictions[0.5][var]
+            result[var] = batch_predictions[var]
 
         # Clean up batch objects
         del fitted_model
diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py
index 37d353a1..cac9ad61 100644
--- a/policyengine_us_data/datasets/puf/puf.py
+++ b/policyengine_us_data/datasets/puf/puf.py
@@ -189,8 +189,7 @@ def impute_pension_contributions_to_puf(puf_df):
     # Predict using the fitted model
     predictions = fitted_model.predict(X_test=puf_df[["employment_income"]])
 
-    # Return the median (0.5 quantile) predictions
-    return predictions[0.5]["pre_tax_contributions"]
+    return predictions["pre_tax_contributions"]
 
 
 def impute_missing_demographics(
@@ -242,12 +241,10 @@ def impute_missing_demographics(
     ].reset_index()
 
     # Predict demographics
-    predictions = fitted_model.predict(
+    predicted_demographics = fitted_model.predict(
         X_test=puf_without_demographics[NON_DEMOGRAPHIC_VARIABLES]
     )
 
-    # Get median predictions
-    predicted_demographics = predictions[0.5]
     puf_with_imputed_demographics = pd.concat(
         [puf_without_demographics, predicted_demographics], axis=1
     )
diff --git a/policyengine_us_data/db/load_age_targets.py b/policyengine_us_data/db/load_age_targets.py
index b93c7687..f42adcf3 100644
--- a/policyengine_us_data/db/load_age_targets.py
+++ b/policyengine_us_data/db/load_age_targets.py
@@ -174,18 +174,18 @@ def transform_age_data(age_data, docs):
     )
     df = df.drop(columns="NAME")
 
-    df = df.rename({"GEO_ID": "ucgid"}, axis=1)
-    df_data = df.rename(columns=rename_mapping)[["ucgid"] + list(AGE_COLS)]
+    df = df.rename({"GEO_ID": "ucgid_str"}, axis=1)
+    df_data = df.rename(columns=rename_mapping)[["ucgid_str"] + list(AGE_COLS)]
 
     # Filter out Puerto Rico's district and state records, if needed
     df_geos = df_data[
-        ~df_data["ucgid"].isin(["5001800US7298", "0400000US72"])
+        ~df_data["ucgid_str"].isin(["5001800US7298", "0400000US72"])
     ].copy()
 
-    df = df_geos[["ucgid"] + AGE_COLS]
+    df = df_geos[["ucgid_str"] + AGE_COLS]
 
     df_long = df.melt(
-        id_vars="ucgid",
+        id_vars="ucgid_str",
         value_vars=AGE_COLS,
         var_name="age_range",
         value_name="value",
@@ -212,11 +212,11 @@ def load_age_data(df_long, geo, stratum_lookup={}):
 
     # Quick data quality check before loading ----
     if geo == "National":
-        assert len(set(df_long.ucgid)) == 1
+        assert len(set(df_long.ucgid_str)) == 1
    elif geo == "State":
-        assert len(set(df_long.ucgid)) == 51
+        assert len(set(df_long.ucgid_str)) == 51
    elif geo == "District":
-        assert len(set(df_long.ucgid)) == 436
+        assert len(set(df_long.ucgid_str)) == 436
    else:
         raise ValueError('geo must be one of "National", "State", "District"')
 
@@ -238,7 +238,7 @@ def load_age_data(df_long, geo, stratum_lookup={}):
 
         # Create the parent Stratum object.
         # We will attach children to it before adding it to the session.
-        note = f"Age: {row['age_range']}, Geo: {row['ucgid']}"
+        note = f"Age: {row['age_range']}, Geo: {row['ucgid_str']}"
         parent_geo = get_parent_geo(geo)
         parent_stratum_id = (
             stratum_lookup[parent_geo][row["age_range"]]
@@ -253,9 +253,9 @@ def load_age_data(df_long, geo, stratum_lookup={}):
         # Create constraints and link them to the parent's relationship attribute.
         new_stratum.constraints_rel = [
             StratumConstraint(
-                constraint_variable="ucgid",
-                operation="equals",
-                value=row["ucgid"],
+                constraint_variable="ucgid_str",
+                operation="in",
+                value=row["ucgid_str"],
             ),
             StratumConstraint(
                 constraint_variable="age",
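
Context for the `[0.5]` removals above: the older microimpute output keyed predictions by quantile, so call sites indexed the result with `[0.5]` to pull out the median DataFrame, whereas the new output format returns that DataFrame directly. The sketch below shows only the before/after access pattern, using plain pandas objects as hypothetical stand-ins for microimpute's actual return values:

```python
import pandas as pd

# Hypothetical stand-ins for fitted_model.predict(...) output; only the
# indexing pattern is illustrated, not the real microimpute objects.
old_output = {0.5: pd.DataFrame({"rent": [1200.0, 950.0]})}  # old: dict keyed by quantile
new_output = pd.DataFrame({"rent": [1200.0, 950.0]})         # new: DataFrame of median predictions

rent_before = old_output[0.5]["rent"]  # pre-patch access, e.g. imputed_values[0.5]["rent"]
rent_after = new_output["rent"]        # post-patch access, e.g. imputed_values["rent"]

assert rent_before.equals(rent_after)  # same values, one less level of indexing
```

The `mean_quantile=0.5` argument passed in `add_tips` is untouched by this patch; only the indexing of the returned object changes.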