diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..90860eaf 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: patch + changes: + fixed: + - Immigration status mapping. diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index f932e0d5..a7eeb9a1 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -1410,6 +1410,9 @@ def get_arrival_year_midpoint(peinusyr): len(person), "LEGAL_PERMANENT_RESIDENT", dtype="U32" ) + # Set citizens (SSN card type 1) to CITIZEN status + immigration_status[ssn_card_type == 1] = "CITIZEN" + # 1. Undocumented: SSN card type 0 who arrived 1982 or later arrived_before_1982 = np.isin(person.PEINUSYR, [1, 2, 3, 4, 5, 6, 7]) undoc_mask = (ssn_card_type == 0) & (~arrived_before_1982) @@ -1459,7 +1462,8 @@ def get_arrival_year_midpoint(peinusyr): immigration_status[mask] = "TPS" # Final write (all values now in ImmigrationStatus Enum) - cps["immigration_status"] = immigration_status.astype("S") + # Save as immigration_status_str since that's what PolicyEngine expects + cps["immigration_status_str"] = immigration_status.astype("S") # ============================================================================ # CONVERT TO STRING LABELS AND STORE # ============================================================================ diff --git a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py index 8a9d8d2b..4634f4bd 100644 --- a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py +++ b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py @@ -177,6 +177,54 @@ def test_aca_calibration(): ), f"One or more states exceeded tolerance of {TOLERANCE:.0%}." 
def test_immigration_status_diversity():
    """Check that immigration status is read from the data, not defaulted.

    The test computes the share of persons in each immigration status and
    asserts that the ``CITIZEN`` share is (a) not so close to 100% that the
    variable's default is evidently being used, and (b) inside a plausible
    80-95% band.

    Shares are population shares: each person record contributes its
    survey weight rather than counting as one.
    """
    from policyengine_us_data.datasets.cps import EnhancedCPS_2024
    from policyengine_us import Microsimulation
    import numpy as np

    # Above this citizen share we assume the default value is being used.
    default_only_citizen_pct = 99
    # Plausible band for the citizen share of the population.
    min_citizen_pct, max_citizen_pct = 80, 95

    sim = Microsimulation(dataset=EnhancedCPS_2024)

    # Immigration status for all persons (a weighted MicroSeries).
    immigration_status = sim.calculate("immigration_status", 2024)
    statuses = np.asarray(immigration_status)

    # NOTE(review): assumes the MicroSeries exposes per-person weights as
    # `.weights`; if it does not, fall back to unweighted record shares so
    # the test still runs.
    weights = getattr(immigration_status, "weights", None)
    if weights is None:
        weights = np.ones(len(statuses), dtype=float)
    else:
        weights = np.asarray(weights, dtype=float)

    # Record counts per status, plus the index of each record's status so
    # the weights can be summed per status.
    unique_statuses, inverse, counts = np.unique(
        statuses, return_inverse=True, return_counts=True
    )
    weighted_totals = np.bincount(
        inverse.ravel(), weights=weights, minlength=len(unique_statuses)
    )

    total_population = weighted_totals.sum()
    assert (
        total_population > 0
    ), "No weighted person records found for immigration_status."

    status_percentages = {}
    for status, count, weighted_total in zip(
        unique_statuses, counts, weighted_totals
    ):
        # Normalise numpy scalars (and raw bytes) to plain `str` keys so the
        # "CITIZEN" lookup below cannot silently miss.
        key = status.decode() if isinstance(status, bytes) else str(status)
        pct = 100 * weighted_total / total_population
        status_percentages[key] = pct
        print(f" {status}: {count:,} ({pct:.1f}%)")

    # Test that not everyone is a citizen (would indicate default value
    # being used).
    citizen_pct = status_percentages.get("CITIZEN", 0)

    assert citizen_pct < default_only_citizen_pct, (
        f"Too many citizens ({citizen_pct:.1f}%) - likely using default value. "
        "Immigration status not being read from data."
    )

    # The upper bound also guarantees that more than 5% of the population
    # is non-citizen, so no separate non-citizen check is needed.
    assert (
        min_citizen_pct < citizen_pct < max_citizen_pct
    ), f"Citizen percentage ({citizen_pct:.1f}%) outside expected range (80-95%)"

    print(
        f"Immigration status diversity test passed: {citizen_pct:.1f}% citizens"
    )


def test_medicaid_calibration():
    import pandas as pd