PolicyEngine · PavelMakarchuk · Sep 24, 2025 · Sep 24, 2025 · Sep 24, 2025
diff --git a/changelog_entry.yaml b/changelog_entry.yaml
@@ -0,0 +1,4 @@
+- bump: patch
+  changes:
+    fixed:
+    - Immigration status mapping (citizens set to CITIZEN; saved as immigration_status_str).
diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py
@@ -1410,6 +1410,9 @@ def get_arrival_year_midpoint(peinusyr):
         len(person), "LEGAL_PERMANENT_RESIDENT", dtype="U32"
     )
 
+    # Set citizens (SSN card type 1) to CITIZEN status
+    immigration_status[ssn_card_type == 1] = "CITIZEN"
+
     # 1. Undocumented: SSN card type 0 who arrived 1982 or later
     arrived_before_1982 = np.isin(person.PEINUSYR, [1, 2, 3, 4, 5, 6, 7])
     undoc_mask = (ssn_card_type == 0) & (~arrived_before_1982)
@@ -1459,7 +1462,8 @@ def get_arrival_year_midpoint(peinusyr):
     immigration_status[mask] = "TPS"
 
     # Final write (all values now in ImmigrationStatus Enum)
-    cps["immigration_status"] = immigration_status.astype("S")
+    # Save as immigration_status_str since that's what PolicyEngine expects
+    cps["immigration_status_str"] = immigration_status.astype("S")
     # ============================================================================
     # CONVERT TO STRING LABELS AND STORE
     # ============================================================================

diff --git a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py
@@ -177,6 +177,54 @@ def test_aca_calibration():
     ), f"One or more states exceeded tolerance of {TOLERANCE:.0%}."
 
 
+def test_immigration_status_diversity():
+    """Test that immigration statuses show appropriate diversity (not all citizens)."""
+    from policyengine_us_data.datasets.cps import EnhancedCPS_2024
+    from policyengine_us import Microsimulation
+    import numpy as np
+
+    sim = Microsimulation(dataset=EnhancedCPS_2024)
+
+    # Get immigration status for all persons (already weighted MicroSeries)
+    immigration_status = sim.calculate("immigration_status", 2024)
+
+    # Count different statuses
+    unique_statuses, counts = np.unique(immigration_status, return_counts=True)
+
+    # Calculate unweighted shares: np.unique counts and len() ignore weights
+    total_population = len(immigration_status)
+    status_percentages = {}
+
+    for status, count in zip(unique_statuses, counts):
+        pct = 100 * count / total_population
+        status_percentages[status] = pct
+        print(f"  {status}: {count:,} ({pct:.1f}%)")
+
+    # Test that not everyone is a citizen (would indicate default value being used)
+    citizen_pct = status_percentages.get("CITIZEN", 0)
+
+    # Fail if 99% or more are citizens (indicating the default is being used)
+    assert citizen_pct < 99, (
+        f"Too many citizens ({citizen_pct:.1f}%) - likely using default value. "
+        "Immigration status not being read from data."
+    )
+
+    # Also check the citizen share is plausible (expected ~85-90%; allow 80-95%)
+    assert (
+        80 < citizen_pct < 95
+    ), f"Citizen percentage ({citizen_pct:.1f}%) outside expected range (80-95%)"
+
+    # Check that we have some non-citizens
+    non_citizen_pct = 100 - citizen_pct
+    assert (
+        non_citizen_pct > 5
+    ), f"Too few non-citizens ({non_citizen_pct:.1f}%) - expected at least 5%"
+
+    print(
+        f"Immigration status diversity test passed: {citizen_pct:.1f}% citizens"
+    )
+
+
 def test_medicaid_calibration():
 
     import pandas as pd