From 146a06a92b4d3009668f46238cad6cf16dc982cd Mon Sep 17 00:00:00 2001
From: "baogorek@gmail.com" <baogorek@gmail.com>
Date: Wed, 4 Feb 2026 14:45:29 -0500
Subject: [PATCH 1/4] Improve random() to use name-based salting for
 order-independent reproducibility

Replace the global execution counter with variable-name-based salting to
prevent the "ripple effect" where adding, removing, or reordering variables
changes random values for ALL subsequent variables.

Changes:
- Add _stable_string_hash() for deterministic hashing across Python processes
- Modify random() to use tracer stack variable name instead of global counter
- Add optional salt parameter for use outside formula context
- Update tests to verify order independence and reproducibility

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 policyengine_core/commons/formulas.py  |  73 +++++--
 tests/core/commons/test_random_seed.py | 274 ++++++++++++++++---------
 2 files changed, 233 insertions(+), 114 deletions(-)

diff --git a/policyengine_core/commons/formulas.py b/policyengine_core/commons/formulas.py
index 2bd1c494..12509c44 100644
--- a/policyengine_core/commons/formulas.py
+++ b/policyengine_core/commons/formulas.py
@@ -305,38 +305,87 @@ def amount_between(
     return clip(amount, threshold_1, threshold_2) - threshold_1
 
 
-def random(population):
+def _stable_string_hash(s: str) -> np.uint64:
+    """Deterministic hash consistent across Python processes.
+
+    Python's built-in hash() is not deterministic across processes (since 3.3),
+    so we use a polynomial rolling hash with additional mixing.
+    """
+    import warnings
+
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", "overflow encountered", RuntimeWarning)
+        h = np.uint64(0)
+        for byte in s.encode("utf-8"):
+            h = h * np.uint64(31) + np.uint64(byte)
+        h = h ^ (h >> np.uint64(33))
+        h = h * np.uint64(0xFF51AFD7ED558CCD)
+        h = h ^ (h >> np.uint64(33))
+    return h
+
+
+def random(population, salt: str = None):
     """
     Generate random values for each entity in the population.
 
+    Uses the current variable name (from the tracer stack) to create
+    deterministic random values that are independent of variable execution
+    order. This prevents the "ripple effect" where adding or reordering
+    variables would change random values for all subsequent variables.
+
     Args:
         population: The population object containing simulation data.
+        salt: Optional salt string to differentiate random streams when
+              called outside a formula context or to create distinct
+              streams within the same formula.
 
     Returns:
-        np.ndarray: Array of random values for each entity.
+        np.ndarray: Array of random values in [0, 1) for each entity.
+
+    Raises:
+        ValueError: If called outside a formula context without a salt.
     """
-    # Initialize count of random calls if not already present
-    if not hasattr(population.simulation, "count_random_calls"):
-        population.simulation.count_random_calls = 0
-    population.simulation.count_random_calls += 1
+    simulation = population.simulation
+
+    # Get variable name from tracer stack
+    if simulation.tracer.stack:
+        variable_name = simulation.tracer.stack[-1]["name"]
+    elif salt is not None:
+        variable_name = "__no_variable__"
+    else:
+        raise ValueError(
+            "random() called outside formula context without a salt. "
+            "Provide salt parameter: random(population, salt='unique_name')"
+        )
+
+    # Per-variable counter (replaces global counter)
+    if not hasattr(simulation, "random_call_counts"):
+        simulation.random_call_counts = {}
+
+    counter_key = f"{variable_name}:{salt or ''}"
+    simulation.random_call_counts[counter_key] = (
+        simulation.random_call_counts.get(counter_key, 0) + 1
+    )
+    call_count = simulation.random_call_counts[counter_key]
 
     # Get known periods or use default calculation period
-    known_periods = population.simulation.get_holder(
+    known_periods = simulation.get_holder(
         f"{population.entity.key}_id"
     ).get_known_periods()
     period = (
         known_periods[0]
         if known_periods
-        else population.simulation.default_calculation_period
+        else simulation.default_calculation_period
     )
 
     # Get entity IDs for the period
     entity_ids = population(f"{population.entity.key}_id", period)
 
-    # Generate deterministic random values using vectorised hash
-    seeds = np.abs(
-        entity_ids * 100 + population.simulation.count_random_calls
-    ).astype(np.uint64)
+    # Create deterministic seed from variable name + call count
+    base_seed = _stable_string_hash(f"{variable_name}:{call_count}:{salt or ''}")
+
+    # Combine with entity IDs via XOR
+    seeds = (entity_ids.astype(np.uint64) ^ base_seed).astype(np.uint64)
 
     # PCG-style mixing function for high-quality pseudo-random generation
     x = seeds * np.uint64(0x5851F42D4C957F2D)
diff --git a/tests/core/commons/test_random_seed.py b/tests/core/commons/test_random_seed.py
index e63abe3f..be57a98d 100644
--- a/tests/core/commons/test_random_seed.py
+++ b/tests/core/commons/test_random_seed.py
@@ -1,152 +1,222 @@
-"""Test the random function with large entity IDs to ensure no overflow."""
+"""Test the random function with name-based salting."""
 
 import numpy as np
 import pytest
 from unittest.mock import Mock
-from policyengine_core.commons.formulas import random
+from policyengine_core.commons.formulas import random, _stable_string_hash
+
+
+def create_mock_population(entity_key="person", variable_name="test_variable"):
+    """Helper to create a mock population with tracer stack."""
+    population = Mock()
+
+    # Create simulation as a simple object to avoid Mock's hasattr behavior
+    class MockSimulation:
+        pass
+
+    simulation = MockSimulation()
+    simulation.tracer = Mock()
+    simulation.tracer.stack = [
+        {"name": variable_name, "period": "2024", "branch_name": "default"}
+    ]
+
+    holder = Mock()
+    holder.get_known_periods.return_value = []
+    simulation.get_holder = Mock(return_value=holder)
+    simulation.default_calculation_period = Mock()
+
+    population.simulation = simulation
+    population.entity = Mock()
+    population.entity.key = entity_key
+
+    return population
+
+
+class TestStableStringHash:
+    """Test the stable string hash function."""
+
+    def test_hash_consistency(self):
+        """Test that hash produces consistent results."""
+        s = "test_variable:1:"
+        h1 = _stable_string_hash(s)
+        h2 = _stable_string_hash(s)
+        assert h1 == h2
+
+    def test_hash_different_inputs(self):
+        """Test that different inputs produce different hashes."""
+        h1 = _stable_string_hash("variable_a:1:")
+        h2 = _stable_string_hash("variable_b:1:")
+        assert h1 != h2
+
+    def test_hash_returns_uint64(self):
+        """Test that hash returns numpy uint64."""
+        h = _stable_string_hash("test")
+        assert isinstance(h, np.uint64)
 
 
 class TestRandomSeed:
-    """Test random seed handling to prevent NumPy overflow errors."""
+    """Test random seed handling with name-based salting."""
 
     def test_random_with_large_entity_ids(self):
         """Test that random() handles large entity IDs without overflow."""
-        # Create a mock population with simulation
-        population = Mock()
-        population.simulation = Mock()
-        population.simulation.count_random_calls = 0
-        population.entity = Mock()
-        population.entity.key = "person"
-
-        # Mock the get_holder and get_known_periods
-        holder = Mock()
-        holder.get_known_periods.return_value = []
-        population.simulation.get_holder.return_value = holder
-        population.simulation.default_calculation_period = Mock()
-
-        # Test with very large entity IDs that would cause overflow
-        # if not handled properly
-        large_ids = np.array(
-            [
-                np.iinfo(np.int64).max - 1000,  # Very large positive ID
-                np.iinfo(np.int64).max // 2,  # Large positive ID
-                1234567890123456789,  # Another large ID
-            ]
-        )
-
-        # Mock the population call to return large IDs
+        population = create_mock_population()
+        large_ids = np.array([
+            np.iinfo(np.int64).max - 1000,
+            np.iinfo(np.int64).max // 2,
+            1234567890123456789,
+        ])
         population.side_effect = lambda key, period: large_ids
 
-        # This should not raise a ValueError about negative seeds
         result = random(population)
 
-        # Check that we got valid random values
         assert isinstance(result, np.ndarray)
         assert len(result) == len(large_ids)
         assert all(0 <= val <= 1 for val in result)
 
     def test_random_seed_consistency(self):
         """Test that random() produces consistent results for same inputs."""
-        # Create mock population
-        population = Mock()
-        population.simulation = Mock()
-        population.simulation.count_random_calls = 0
-        population.entity = Mock()
-        population.entity.key = "household"
-
-        holder = Mock()
-        holder.get_known_periods.return_value = []
-        population.simulation.get_holder.return_value = holder
-        population.simulation.default_calculation_period = Mock()
-
-        # Use same IDs
+        population = create_mock_population(entity_key="household")
         ids = np.array([1, 2, 3])
         population.side_effect = lambda key, period: ids
 
-        # First call
         result1 = random(population)
 
-        # Reset count to simulate same conditions
-        population.simulation.count_random_calls = 0
+        # Reset call counts to simulate same conditions
+        population.simulation.random_call_counts = {}
 
-        # Second call with same conditions
         result2 = random(population)
 
-        # Results should be identical
         np.testing.assert_array_equal(result1, result2)
 
-    def test_random_increments_call_count(self):
-        """Test that random() increments the call counter."""
-        population = Mock()
-        population.simulation = Mock()
-        population.simulation.count_random_calls = 0
-        population.entity = Mock()
-        population.entity.key = "person"
-
-        holder = Mock()
-        holder.get_known_periods.return_value = []
-        population.simulation.get_holder.return_value = holder
-        population.simulation.default_calculation_period = Mock()
-
+    def test_random_increments_per_variable_counter(self):
+        """Test that random() increments the per-variable call counter."""
+        population = create_mock_population()
         ids = np.array([1, 2, 3])
         population.side_effect = lambda key, period: ids
 
-        # First call
         random(population)
-        assert population.simulation.count_random_calls == 1
+        assert population.simulation.random_call_counts["test_variable:"] == 1
 
-        # Second call
         random(population)
-        assert population.simulation.count_random_calls == 2
+        assert population.simulation.random_call_counts["test_variable:"] == 2
 
     def test_random_handles_negative_ids(self):
         """Test that random() handles negative IDs properly."""
-        population = Mock()
-        population.simulation = Mock()
-        population.simulation.count_random_calls = 0
-        population.entity = Mock()
-        population.entity.key = "person"
-
-        holder = Mock()
-        holder.get_known_periods.return_value = []
-        population.simulation.get_holder.return_value = holder
-        population.simulation.default_calculation_period = Mock()
-
-        # Include negative IDs
+        population = create_mock_population()
         ids = np.array([-100, -1, 0, 1, 100])
         population.side_effect = lambda key, period: ids
 
-        # Should handle negative IDs without errors
         result = random(population)
 
         assert isinstance(result, np.ndarray)
         assert len(result) == len(ids)
         assert all(0 <= val <= 1 for val in result)
 
-    def test_no_negative_seed_error_with_overflow(self):
-        """Test that seed calculation overflow doesn't cause negative seed error."""
-        population = Mock()
-        population.simulation = Mock()
-        population.simulation.count_random_calls = 999999999  # Large count
-        population.entity = Mock()
-        population.entity.key = "person"
-
-        holder = Mock()
-        holder.get_known_periods.return_value = []
-        population.simulation.get_holder.return_value = holder
-        population.simulation.default_calculation_period = Mock()
-
-        # Use the exact ID that would cause overflow in old implementation
-        # This ID when multiplied by 100 and added to count_random_calls
-        # would overflow int64 and become negative
-        overflow_id = np.array([np.iinfo(np.int64).max // 100])
-        population.side_effect = lambda key, period: overflow_id
-
-        # In the old implementation, this would raise:
-        # ValueError: Seed must be between 0 and 2**32 - 1
-        # With the fix using abs(), it should work fine
-        result = random(population)
+    def test_random_order_independence(self):
+        """Test that variable order doesn't affect random values."""
+        ids = np.array([1, 2, 3])
+
+        # Simulate calling variable_a first, then variable_b
+        pop_a = create_mock_population(variable_name="variable_a")
+        pop_a.side_effect = lambda key, period: ids
+        result_a_first = random(pop_a)
+
+        pop_b = create_mock_population(variable_name="variable_b")
+        pop_b.side_effect = lambda key, period: ids
+        result_b_after_a = random(pop_b)
+
+        # Now simulate calling variable_b first, then variable_a
+        pop_b2 = create_mock_population(variable_name="variable_b")
+        pop_b2.side_effect = lambda key, period: ids
+        result_b_first = random(pop_b2)
+
+        pop_a2 = create_mock_population(variable_name="variable_a")
+        pop_a2.side_effect = lambda key, period: ids
+        result_a_after_b = random(pop_a2)
+
+        # Same variable should produce same results regardless of order
+        np.testing.assert_array_equal(result_a_first, result_a_after_b)
+        np.testing.assert_array_equal(result_b_after_a, result_b_first)
+
+    def test_random_multiple_calls_same_formula(self):
+        """Test that multiple calls within same formula produce different values."""
+        population = create_mock_population()
+        ids = np.array([1, 2, 3])
+        population.side_effect = lambda key, period: ids
+
+        result1 = random(population)
+        result2 = random(population)
+        result3 = random(population)
+
+        # Each call should produce different values
+        assert not np.array_equal(result1, result2)
+        assert not np.array_equal(result2, result3)
+        assert not np.array_equal(result1, result3)
+
+    def test_random_with_salt(self):
+        """Test that salt parameter creates distinct random streams."""
+        population = create_mock_population()
+        ids = np.array([1, 2, 3])
+        population.side_effect = lambda key, period: ids
+
+        result_no_salt = random(population)
+
+        # Reset counter
+        population.simulation.random_call_counts = {}
+
+        result_with_salt = random(population, salt="custom_salt")
+
+        # Different salts should produce different values
+        assert not np.array_equal(result_no_salt, result_with_salt)
+
+    def test_random_outside_formula_raises(self):
+        """Test that calling random() outside formula context without salt raises."""
+        population = create_mock_population()
+        population.simulation.tracer.stack = []  # Empty stack
+        ids = np.array([1, 2, 3])
+        population.side_effect = lambda key, period: ids
+
+        with pytest.raises(ValueError, match="random\\(\\) called outside formula context"):
+            random(population)
+
+    def test_random_outside_formula_with_salt(self):
+        """Test that salt allows random() to work outside formula context."""
+        population = create_mock_population()
+        population.simulation.tracer.stack = []  # Empty stack
+        ids = np.array([1, 2, 3])
+        population.side_effect = lambda key, period: ids
+
+        result = random(population, salt="explicit_salt")
 
         assert isinstance(result, np.ndarray)
-        assert len(result) == 1
-        assert 0 <= result[0] <= 1
+        assert len(result) == len(ids)
+        assert all(0 <= val <= 1 for val in result)
+
+    def test_different_variables_produce_different_values(self):
+        """Test that different variable names produce different random values."""
+        ids = np.array([1, 2, 3])
+
+        pop_a = create_mock_population(variable_name="variable_a")
+        pop_a.side_effect = lambda key, period: ids
+        result_a = random(pop_a)
+
+        pop_b = create_mock_population(variable_name="variable_b")
+        pop_b.side_effect = lambda key, period: ids
+        result_b = random(pop_b)
+
+        # Different variables should produce different values
+        assert not np.array_equal(result_a, result_b)
+
+    def test_same_entity_same_variable_same_value(self):
+        """Test reproducibility: same entity + variable = same random value."""
+        ids = np.array([42])
+
+        pop1 = create_mock_population(variable_name="my_variable")
+        pop1.side_effect = lambda key, period: ids
+        result1 = random(pop1)
+
+        pop2 = create_mock_population(variable_name="my_variable")
+        pop2.side_effect = lambda key, period: ids
+        result2 = random(pop2)
+
+        np.testing.assert_array_equal(result1, result2)

From 41f175533ce7823495f4c8ac4bc2236a3c928f8f Mon Sep 17 00:00:00 2001
From: "baogorek@gmail.com" <baogorek@gmail.com>
Date: Wed, 4 Feb 2026 14:47:39 -0500
Subject: [PATCH 2/4] Add changelog entry for random() name-based salting

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 changelog_entry.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/changelog_entry.yaml b/changelog_entry.yaml
index e69de29b..c356b2f3 100644
--- a/changelog_entry.yaml
+++ b/changelog_entry.yaml
@@ -0,0 +1,4 @@
+- bump: major
+  changes:
+    changed:
+      - Replace global execution counter in random() with variable-name-based salting for order-independent reproducibility. This is a breaking change that will produce different random values for existing simulations.

From 17bbd31ef11b6b5e65ab22bdc7ec3be2012b03c3 Mon Sep 17 00:00:00 2001
From: "baogorek@gmail.com" <baogorek@gmail.com>
Date: Wed, 4 Feb 2026 14:49:29 -0500
Subject: [PATCH 3/4] Apply black formatting

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 policyengine_core/commons/formulas.py       |  8 ++++++--
 policyengine_core/populations/population.py |  8 ++++++--
 tests/core/commons/test_random_seed.py      | 16 ++++++++++------
 3 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/policyengine_core/commons/formulas.py b/policyengine_core/commons/formulas.py
index 12509c44..9b5443bb 100644
--- a/policyengine_core/commons/formulas.py
+++ b/policyengine_core/commons/formulas.py
@@ -314,7 +314,9 @@ def _stable_string_hash(s: str) -> np.uint64:
     import warnings
 
     with warnings.catch_warnings():
-        warnings.filterwarnings("ignore", "overflow encountered", RuntimeWarning)
+        warnings.filterwarnings(
+            "ignore", "overflow encountered", RuntimeWarning
+        )
         h = np.uint64(0)
         for byte in s.encode("utf-8"):
             h = h * np.uint64(31) + np.uint64(byte)
@@ -382,7 +384,9 @@ def random(population, salt: str = None):
     entity_ids = population(f"{population.entity.key}_id", period)
 
     # Create deterministic seed from variable name + call count
-    base_seed = _stable_string_hash(f"{variable_name}:{call_count}:{salt or ''}")
+    base_seed = _stable_string_hash(
+        f"{variable_name}:{call_count}:{salt or ''}"
+    )
 
     # Combine with entity IDs via XOR
     seeds = (entity_ids.astype(np.uint64) ^ base_seed).astype(np.uint64)
diff --git a/policyengine_core/populations/population.py b/policyengine_core/populations/population.py
index 988485ec..c2d9d0f4 100644
--- a/policyengine_core/populations/population.py
+++ b/policyengine_core/populations/population.py
@@ -78,13 +78,17 @@ def check_period_validity(
         if period is None:
             stack = traceback.extract_stack()
             filename, line_number, function_name, line_of_code = stack[-3]
-            raise ValueError("""
+            raise ValueError(
+                """
 You requested computation of variable "{}", but you did not specify on which period in "{}:{}":
     {}
 When you request the computation of a variable within a formula, you must always specify the period as the second parameter. The convention is to call this parameter "period". For example:
     computed_salary = person('salary', period).
 See more information at <https://openfisca.org/doc/coding-the-legislation/35_periods.html#periods-in-variable-definition>.
-""".format(variable_name, filename, line_number, line_of_code))
+""".format(
+                    variable_name, filename, line_number, line_of_code
+                )
+            )
 
     def __call__(
         self,
diff --git a/tests/core/commons/test_random_seed.py b/tests/core/commons/test_random_seed.py
index be57a98d..a5c7e82f 100644
--- a/tests/core/commons/test_random_seed.py
+++ b/tests/core/commons/test_random_seed.py
@@ -60,11 +60,13 @@ class TestRandomSeed:
     def test_random_with_large_entity_ids(self):
         """Test that random() handles large entity IDs without overflow."""
         population = create_mock_population()
-        large_ids = np.array([
-            np.iinfo(np.int64).max - 1000,
-            np.iinfo(np.int64).max // 2,
-            1234567890123456789,
-        ])
+        large_ids = np.array(
+            [
+                np.iinfo(np.int64).max - 1000,
+                np.iinfo(np.int64).max // 2,
+                1234567890123456789,
+            ]
+        )
         population.side_effect = lambda key, period: large_ids
 
         result = random(population)
@@ -176,7 +178,9 @@ def test_random_outside_formula_raises(self):
         ids = np.array([1, 2, 3])
         population.side_effect = lambda key, period: ids
 
-        with pytest.raises(ValueError, match="random\\(\\) called outside formula context"):
+        with pytest.raises(
+            ValueError, match="random\\(\\) called outside formula context"
+        ):
             random(population)
 
     def test_random_outside_formula_with_salt(self):

From 0fb48228f9b5ba7700b924c8e4ff6240b3834e8b Mon Sep 17 00:00:00 2001
From: "baogorek@gmail.com" <baogorek@gmail.com>
Date: Wed, 4 Feb 2026 14:52:04 -0500
Subject: [PATCH 4/4] Revert unrelated population.py formatting change

The local black version (25.12.0) formats differently than CI's black.
Reverting this unrelated file to avoid CI lint failure.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 policyengine_core/populations/population.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/policyengine_core/populations/population.py b/policyengine_core/populations/population.py
index c2d9d0f4..988485ec 100644
--- a/policyengine_core/populations/population.py
+++ b/policyengine_core/populations/population.py
@@ -78,17 +78,13 @@ def check_period_validity(
         if period is None:
             stack = traceback.extract_stack()
             filename, line_number, function_name, line_of_code = stack[-3]
-            raise ValueError(
-                """
+            raise ValueError("""
 You requested computation of variable "{}", but you did not specify on which period in "{}:{}":
     {}
 When you request the computation of a variable within a formula, you must always specify the period as the second parameter. The convention is to call this parameter "period". For example:
     computed_salary = person('salary', period).
 See more information at <https://openfisca.org/doc/coding-the-legislation/35_periods.html#periods-in-variable-definition>.
-""".format(
-                    variable_name, filename, line_number, line_of_code
-                )
-            )
+""".format(variable_name, filename, line_number, line_of_code))
 
     def __call__(
         self,