From c94efad3c916f7599966f29e0243e00a5daf7b17 Mon Sep 17 00:00:00 2001 From: PavelMakarchuk Date: Mon, 17 Feb 2025 13:11:05 +0100 Subject: [PATCH 1/6] Impute pregnancy Fixes #162 --- policyengine_us_data/datasets/cps/cps.py | 17 ++++++++++++++++- policyengine_us_data/utils/loss.py | 15 +++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 9771b43f..e7eba502 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -229,8 +229,23 @@ def add_personal_variables(cps: h5py.File, person: DataFrame) -> None: person.A_AGE == 80, # NB: randint is inclusive of first argument, exclusive of second. np.random.randint(80, 85, len(person)), - person.A_AGE, + np.where( + person.A_AGE == 0, + np.where( + np.random.randint(0, 2, len(person)), # Random number of 0 or 1 + # If 1 is flipped, select a random number between -0.75 and 0 + # This will represent the pregnany month + # At -0.75 the pregnancy month is 0 and at -0.0001 the pregnancy month is 9 + np.random.uniform(-0.75, 0, len(person)), + # If 0 is flipped, the child is a newborn at the age of 0 to 1 + np.random.uniform(0, 1, len(person)), + ), + person.A_AGE, + ) ) + cps["is_pregnant"] = (cps["age"] >= -0.75) & (cps["age"] < 0) + cps["is_newborn"] = (cps["age"] >= 0) & (cps["age"] < 1) + # A_SEX is 1 -> male, 2 -> female. cps["is_female"] = person.A_SEX == 2 # "Is...blind or does...have serious difficulty seeing even when Wearing diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index a01b16a4..2c576c5e 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -254,6 +254,8 @@ def build_loss_matrix(dataset: type, time_period): # Rough estimate, not CPS derived "real_estate_taxes": 400e9, # Rough estimate between 350bn and 600bn total property tax collections "rent": 735e9, # ACS total uprated by CPI + "is_newborn": 3_491_679, # ACS total of people aged 0 + "is_pregnant": 2_618_759, # 75% of the ACS total of people aged 0 } for variable_name, target in HARD_CODED_TOTALS.items(): @@ -340,6 +342,19 @@ def build_loss_matrix(dataset: type, time_period): ) targets_array.append(row["population_under_5"]) + # Population by number of newborns and pregancies + + age = sim.calculate("age").values + newborns = (age >= 0) & (age < 1) + label = "census/newborns" + loss_matrix[label] = sim.map_result(newborns, "person", "household") + targets_array.append(HARD_CODED_TOTALS["is_newborn"]) + + pregnancies = (age >= -0.75) & (age < 0) + label = "census/pregnancies" + loss_matrix[label] = sim.map_result(pregnancies, "person", "household") + targets_array.append(HARD_CODED_TOTALS["is_pregnant"]) + if any(loss_matrix.isna().sum() > 0): raise ValueError("Some targets are missing from the loss matrix") From 978e0fe0df02038ba0f26a32eb20233f8c4f4d93 Mon Sep 17 00:00:00 2001 From: PavelMakarchuk Date: Mon, 17 Feb 2025 16:34:10 +0100 Subject: [PATCH 2/6] refactor and add test --- policyengine_us_data/datasets/cps/cps.py | 36 ++++++++++++------- .../tests/test_datasets/test_enhanced_cps.py | 16 +++++++++ 2 files changed, 39 insertions(+), 13 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index e7eba502..a223c0c6 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -229,7 +229,24 @@ def add_personal_variables(cps: h5py.File, person: DataFrame) -> None: person.A_AGE == 80, # NB: randint is inclusive of first argument, exclusive of second. np.random.randint(80, 85, len(person)), - np.where( + person.A_AGE, + ) + + + # A_SEX is 1 -> male, 2 -> female. + cps["is_female"] = person.A_SEX == 2 + # "Is...blind or does...have serious difficulty seeing even when Wearing + # glasses?" 1 -> Yes + cps["is_blind"] = person.PEDISEYE == 1 + DISABILITY_FLAGS = [ + "PEDIS" + i for i in ["DRS", "EAR", "EYE", "OUT", "PHY", "REM"] + ] + cps["is_disabled"] = (person[DISABILITY_FLAGS] == 1).any(axis=1) + + + def _assign_some_newborns_to_pregnancy(age: pd.Series, person: pd.DataFrame) -> pd.Series: + """ Takes an array of ages, returns the new age array with the given percentage of newborns assigned a negative age (in pregnancy).""" + age = np.where( person.A_AGE == 0, np.where( np.random.randint(0, 2, len(person)), # Random number of 0 or 1 @@ -242,19 +259,11 @@ def add_personal_variables(cps: h5py.File, person: DataFrame) -> None: ), person.A_AGE, ) - ) - cps["is_pregnant"] = (cps["age"] >= -0.75) & (cps["age"] < 0) - cps["is_newborn"] = (cps["age"] >= 0) & (cps["age"] < 1) + return age + + + cps["age"] = _assign_some_newborns_to_pregnancy(cps["age"]) - # A_SEX is 1 -> male, 2 -> female. - cps["is_female"] = person.A_SEX == 2 - # "Is...blind or does...have serious difficulty seeing even when Wearing - # glasses?" 1 -> Yes - cps["is_blind"] = person.PEDISEYE == 1 - DISABILITY_FLAGS = [ - "PEDIS" + i for i in ["DRS", "EAR", "EYE", "OUT", "PHY", "REM"] - ] - cps["is_disabled"] = (person[DISABILITY_FLAGS] == 1).any(axis=1) def children_per_parent(col: str) -> pd.DataFrame: """Calculate number of children in the household using parental @@ -298,6 +307,7 @@ def children_per_parent(col: str) -> pd.DataFrame: cps["is_full_time_college_student"] = person.A_HSCOL == 2 + def add_personal_income_variables( cps: h5py.File, person: DataFrame, year: int ): diff --git a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py index 64f85dcb..f39e4d46 100644 --- a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py +++ b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py @@ -37,3 +37,19 @@ def test_ecps_has_mortgage_interest(): assert sim.calculate("deductible_mortgage_interest").sum() > 1 assert sim.calculate("interest_expense").sum() > 1 + +def test_newborns_and_pregnancies(): + from policyengine_us_data.datasets.cps import EnhancedCPS_2024 + from policyengine_us import Microsimulation + + sim = Microsimulation(dataset=EnhancedCPS_2024) + + # Test for unborn children (age < 0) + unborn = sim.calculate("age") < 0 + unborn_count = unborn.sum() + assert unborn_count > 0 + + # Test for newborns (0 <= age < 1) + newborns = (sim.calculate("age") >= 0) & (sim.calculate("age") < 1) + newborn_count = newborns.sum() + assert newborn_count > 0 From a2e98a5791974bbc763e4943279690e4e9d66d62 Mon Sep 17 00:00:00 2001 From: PavelMakarchuk Date: Mon, 17 Feb 2025 16:34:19 +0100 Subject: [PATCH 3/6] lint --- policyengine_us_data/datasets/cps/cps.py | 21 +++++++++---------- .../tests/test_datasets/test_enhanced_cps.py | 1 + policyengine_us_data/utils/loss.py | 4 ++-- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index a223c0c6..d34603e4 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -232,7 +232,6 @@ def add_personal_variables(cps: h5py.File, person: DataFrame) -> None: person.A_AGE, ) - # A_SEX is 1 -> male, 2 -> female. cps["is_female"] = person.A_SEX == 2 # "Is...blind or does...have serious difficulty seeing even when Wearing @@ -243,28 +242,29 @@ def add_personal_variables(cps: h5py.File, person: DataFrame) -> None: ] cps["is_disabled"] = (person[DISABILITY_FLAGS] == 1).any(axis=1) - - def _assign_some_newborns_to_pregnancy(age: pd.Series, person: pd.DataFrame) -> pd.Series: - """ Takes an array of ages, returns the new age array with the given percentage of newborns assigned a negative age (in pregnancy).""" + def _assign_some_newborns_to_pregnancy( + age: pd.Series, person: pd.DataFrame + ) -> pd.Series: + """Takes an array of ages, returns the new age array with the given percentage of newborns assigned a negative age (in pregnancy).""" age = np.where( person.A_AGE == 0, np.where( - np.random.randint(0, 2, len(person)), # Random number of 0 or 1 + np.random.randint( + 0, 2, len(person) + ), # Random number of 0 or 1 # If 1 is flipped, select a random number between -0.75 and 0 - # This will represent the pregnany month + # This will represent the pregnany month # At -0.75 the pregnancy month is 0 and at -0.0001 the pregnancy month is 9 - np.random.uniform(-0.75, 0, len(person)), - # If 0 is flipped, the child is a newborn at the age of 0 to 1 + np.random.uniform(-0.75, 0, len(person)), + # If 0 is flipped, the child is a newborn at the age of 0 to 1 np.random.uniform(0, 1, len(person)), ), person.A_AGE, ) return age - cps["age"] = _assign_some_newborns_to_pregnancy(cps["age"]) - def children_per_parent(col: str) -> pd.DataFrame: """Calculate number of children in the household using parental pointers. @@ -307,7 +307,6 @@ def children_per_parent(col: str) -> pd.DataFrame: cps["is_full_time_college_student"] = person.A_HSCOL == 2 - def add_personal_income_variables( cps: h5py.File, person: DataFrame, year: int ): diff --git a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py index f39e4d46..9c62b103 100644 --- a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py +++ b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py @@ -38,6 +38,7 @@ def test_ecps_has_mortgage_interest(): assert sim.calculate("deductible_mortgage_interest").sum() > 1 assert sim.calculate("interest_expense").sum() > 1 + def test_newborns_and_pregnancies(): from policyengine_us_data.datasets.cps import EnhancedCPS_2024 from policyengine_us import Microsimulation diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index 2c576c5e..e76a1f55 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -254,8 +254,8 @@ def build_loss_matrix(dataset: type, time_period): # Rough estimate, not CPS derived "real_estate_taxes": 400e9, # Rough estimate between 350bn and 600bn total property tax collections "rent": 735e9, # ACS total uprated by CPI - "is_newborn": 3_491_679, # ACS total of people aged 0 - "is_pregnant": 2_618_759, # 75% of the ACS total of people aged 0 + "is_newborn": 3_491_679, # ACS total of people aged 0 + "is_pregnant": 2_618_759, # 75% of the ACS total of people aged 0 } for variable_name, target in HARD_CODED_TOTALS.items(): From 6755bbb32f76d3b4fbae14ac25621c76271f1ade Mon Sep 17 00:00:00 2001 From: PavelMakarchuk Date: Mon, 17 Feb 2025 16:46:41 +0100 Subject: [PATCH 4/6] fix --- policyengine_us_data/datasets/cps/cps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index d34603e4..ca6759b2 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -263,7 +263,7 @@ def _assign_some_newborns_to_pregnancy( ) return age - cps["age"] = _assign_some_newborns_to_pregnancy(cps["age"]) + cps["age"] = _assign_some_newborns_to_pregnancy(cps["age"], person) def children_per_parent(col: str) -> pd.DataFrame: """Calculate number of children in the household using parental From 4d2c51b3965b498c356b6204c72dbfb7916b3d98 Mon Sep 17 00:00:00 2001 From: PavelMakarchuk Date: Mon, 17 Feb 2025 17:31:50 +0100 Subject: [PATCH 5/6] minor --- policyengine_us_data/utils/loss.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index e76a1f55..d2da83b3 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -254,8 +254,6 @@ def build_loss_matrix(dataset: type, time_period): # Rough estimate, not CPS derived "real_estate_taxes": 400e9, # Rough estimate between 350bn and 600bn total property tax collections "rent": 735e9, # ACS total uprated by CPI - "is_newborn": 3_491_679, # ACS total of people aged 0 - "is_pregnant": 2_618_759, # 75% of the ACS total of people aged 0 } for variable_name, target in HARD_CODED_TOTALS.items(): @@ -348,12 +346,12 @@ def build_loss_matrix(dataset: type, time_period): newborns = (age >= 0) & (age < 1) label = "census/newborns" loss_matrix[label] = sim.map_result(newborns, "person", "household") - targets_array.append(HARD_CODED_TOTALS["is_newborn"]) + targets_array.append(3_491_679) pregnancies = (age >= -0.75) & (age < 0) label = "census/pregnancies" loss_matrix[label] = sim.map_result(pregnancies, "person", "household") - targets_array.append(HARD_CODED_TOTALS["is_pregnant"]) + targets_array.append(2_618_759) if any(loss_matrix.isna().sum() > 0): raise ValueError("Some targets are missing from the loss matrix") From 023ba57961ed3c8d592f6d994e9f7a4b7c93cbfb Mon Sep 17 00:00:00 2001 From: PavelMakarchuk Date: Mon, 17 Feb 2025 18:01:23 +0100 Subject: [PATCH 6/6] rename and lint --- .../tests/test_datasets/test_enhanced_cps.py | 3 ++- policyengine_us_data/utils/loss.py | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py index 3bd99c7c..e6bf595c 100644 --- a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py +++ b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py @@ -38,6 +38,7 @@ def test_ecps_has_mortgage_interest(): assert sim.calculate("deductible_mortgage_interest").sum() > 1 assert sim.calculate("deductible_interest_expense").sum() > 1 + def test_newborns_and_pregnancies(): from policyengine_us_data.datasets.cps import EnhancedCPS_2024 from policyengine_us import Microsimulation @@ -52,4 +53,4 @@ def test_newborns_and_pregnancies(): # Test for newborns (0 <= age < 1) newborns = (sim.calculate("age") >= 0) & (sim.calculate("age") < 1) newborn_count = newborns.sum() - assert newborn_count > 0 \ No newline at end of file + assert newborn_count > 0 diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index d2da83b3..b743dc46 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -343,9 +343,9 @@ def build_loss_matrix(dataset: type, time_period): # Population by number of newborns and pregancies age = sim.calculate("age").values - newborns = (age >= 0) & (age < 1) - label = "census/newborns" - loss_matrix[label] = sim.map_result(newborns, "person", "household") + infants = (age >= 0) & (age < 1) + label = "census/infants" + loss_matrix[label] = sim.map_result(infants, "person", "household") targets_array.append(3_491_679) pregnancies = (age >= -0.75) & (age < 0)