From 146a06a92b4d3009668f46238cad6cf16dc982cd Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Wed, 4 Feb 2026 14:45:29 -0500 Subject: [PATCH 1/4] Improve random() to use name-based salting for order-independent reproducibility Replace the global execution counter with variable-name-based salting to prevent the "ripple effect" where adding, removing, or reordering variables changes random values for ALL subsequent variables. Changes: - Add _stable_string_hash() for deterministic hashing across Python processes - Modify random() to use tracer stack variable name instead of global counter - Add optional salt parameter for use outside formula context - Update tests to verify order independence and reproducibility Co-Authored-By: Claude Opus 4.5 --- policyengine_core/commons/formulas.py | 73 +++++-- tests/core/commons/test_random_seed.py | 274 ++++++++++++++++--------- 2 files changed, 233 insertions(+), 114 deletions(-) diff --git a/policyengine_core/commons/formulas.py b/policyengine_core/commons/formulas.py index 2bd1c494..12509c44 100644 --- a/policyengine_core/commons/formulas.py +++ b/policyengine_core/commons/formulas.py @@ -305,38 +305,87 @@ def amount_between( return clip(amount, threshold_1, threshold_2) - threshold_1 -def random(population): +def _stable_string_hash(s: str) -> np.uint64: + """Deterministic hash consistent across Python processes. + + Python's built-in hash() is not deterministic across processes (since 3.3), + so we use a polynomial rolling hash with additional mixing. + """ + import warnings + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "overflow encountered", RuntimeWarning) + h = np.uint64(0) + for byte in s.encode("utf-8"): + h = h * np.uint64(31) + np.uint64(byte) + h = h ^ (h >> np.uint64(33)) + h = h * np.uint64(0xFF51AFD7ED558CCD) + h = h ^ (h >> np.uint64(33)) + return h + + +def random(population, salt: str = None): """ Generate random values for each entity in the population. + Uses the current variable name (from the tracer stack) to create + deterministic random values that are independent of variable execution + order. This prevents the "ripple effect" where adding or reordering + variables would change random values for all subsequent variables. + Args: population: The population object containing simulation data. + salt: Optional salt string to differentiate random streams when + called outside a formula context or to create distinct + streams within the same formula. Returns: - np.ndarray: Array of random values for each entity. + np.ndarray: Array of random values in [0, 1) for each entity. + + Raises: + ValueError: If called outside a formula context without a salt. """ - # Initialize count of random calls if not already present - if not hasattr(population.simulation, "count_random_calls"): - population.simulation.count_random_calls = 0 - population.simulation.count_random_calls += 1 + simulation = population.simulation + + # Get variable name from tracer stack + if simulation.tracer.stack: + variable_name = simulation.tracer.stack[-1]["name"] + elif salt is not None: + variable_name = "__no_variable__" + else: + raise ValueError( + "random() called outside formula context without a salt. " + "Provide salt parameter: random(population, salt='unique_name')" + ) + + # Per-variable counter (replaces global counter) + if not hasattr(simulation, "random_call_counts"): + simulation.random_call_counts = {} + + counter_key = f"{variable_name}:{salt or ''}" + simulation.random_call_counts[counter_key] = ( + simulation.random_call_counts.get(counter_key, 0) + 1 + ) + call_count = simulation.random_call_counts[counter_key] # Get known periods or use default calculation period - known_periods = population.simulation.get_holder( + known_periods = simulation.get_holder( f"{population.entity.key}_id" ).get_known_periods() period = ( known_periods[0] if known_periods - else population.simulation.default_calculation_period + else simulation.default_calculation_period ) # Get entity IDs for the period entity_ids = population(f"{population.entity.key}_id", period) - # Generate deterministic random values using vectorised hash - seeds = np.abs( - entity_ids * 100 + population.simulation.count_random_calls - ).astype(np.uint64) + # Create deterministic seed from variable name + call count + base_seed = _stable_string_hash(f"{variable_name}:{call_count}:{salt or ''}") + + # Combine with entity IDs via XOR + seeds = (entity_ids.astype(np.uint64) ^ base_seed).astype(np.uint64) # PCG-style mixing function for high-quality pseudo-random generation x = seeds * np.uint64(0x5851F42D4C957F2D) diff --git a/tests/core/commons/test_random_seed.py b/tests/core/commons/test_random_seed.py index e63abe3f..be57a98d 100644 --- a/tests/core/commons/test_random_seed.py +++ b/tests/core/commons/test_random_seed.py @@ -1,152 +1,222 @@ -"""Test the random function with large entity IDs to ensure no overflow.""" +"""Test the random function with name-based salting.""" import numpy as np import pytest from unittest.mock import Mock -from policyengine_core.commons.formulas import random +from policyengine_core.commons.formulas import random, _stable_string_hash + + +def create_mock_population(entity_key="person", variable_name="test_variable"): + """Helper to create a mock population with tracer stack.""" + population = Mock() + + # Create simulation as a simple object to avoid Mock's hasattr behavior + class MockSimulation: + pass + + simulation = MockSimulation() + simulation.tracer = Mock() + simulation.tracer.stack = [ + {"name": variable_name, "period": "2024", "branch_name": "default"} + ] + + holder = Mock() + holder.get_known_periods.return_value = [] + simulation.get_holder = Mock(return_value=holder) + simulation.default_calculation_period = Mock() + + population.simulation = simulation + population.entity = Mock() + population.entity.key = entity_key + + return population + + +class TestStableStringHash: + """Test the stable string hash function.""" + + def test_hash_consistency(self): + """Test that hash produces consistent results.""" + s = "test_variable:1:" + h1 = _stable_string_hash(s) + h2 = _stable_string_hash(s) + assert h1 == h2 + + def test_hash_different_inputs(self): + """Test that different inputs produce different hashes.""" + h1 = _stable_string_hash("variable_a:1:") + h2 = _stable_string_hash("variable_b:1:") + assert h1 != h2 + + def test_hash_returns_uint64(self): + """Test that hash returns numpy uint64.""" + h = _stable_string_hash("test") + assert isinstance(h, np.uint64) class TestRandomSeed: - """Test random seed handling to prevent NumPy overflow errors.""" + """Test random seed handling with name-based salting.""" def test_random_with_large_entity_ids(self): """Test that random() handles large entity IDs without overflow.""" - # Create a mock population with simulation - population = Mock() - population.simulation = Mock() - population.simulation.count_random_calls = 0 - population.entity = Mock() - population.entity.key = "person" - - # Mock the get_holder and get_known_periods - holder = Mock() - holder.get_known_periods.return_value = [] - population.simulation.get_holder.return_value = holder - population.simulation.default_calculation_period = Mock() - - # Test with very large entity IDs that would cause overflow - # if not handled properly - large_ids = np.array( - [ - np.iinfo(np.int64).max - 1000, # Very large positive ID - np.iinfo(np.int64).max // 2, # Large positive ID - 1234567890123456789, # Another large ID - ] - ) - - # Mock the population call to return large IDs + population = create_mock_population() + large_ids = np.array([ + np.iinfo(np.int64).max - 1000, + np.iinfo(np.int64).max // 2, + 1234567890123456789, + ]) population.side_effect = lambda key, period: large_ids - # This should not raise a ValueError about negative seeds result = random(population) - # Check that we got valid random values assert isinstance(result, np.ndarray) assert len(result) == len(large_ids) assert all(0 <= val <= 1 for val in result) def test_random_seed_consistency(self): """Test that random() produces consistent results for same inputs.""" - # Create mock population - population = Mock() - population.simulation = Mock() - population.simulation.count_random_calls = 0 - population.entity = Mock() - population.entity.key = "household" - - holder = Mock() - holder.get_known_periods.return_value = [] - population.simulation.get_holder.return_value = holder - population.simulation.default_calculation_period = Mock() - - # Use same IDs + population = create_mock_population(entity_key="household") ids = np.array([1, 2, 3]) population.side_effect = lambda key, period: ids - # First call result1 = random(population) - # Reset count to simulate same conditions - population.simulation.count_random_calls = 0 + # Reset call counts to simulate same conditions + population.simulation.random_call_counts = {} - # Second call with same conditions result2 = random(population) - # Results should be identical np.testing.assert_array_equal(result1, result2) - def test_random_increments_call_count(self): - """Test that random() increments the call counter.""" - population = Mock() - population.simulation = Mock() - population.simulation.count_random_calls = 0 - population.entity = Mock() - population.entity.key = "person" - - holder = Mock() - holder.get_known_periods.return_value = [] - population.simulation.get_holder.return_value = holder - population.simulation.default_calculation_period = Mock() - + def test_random_increments_per_variable_counter(self): + """Test that random() increments the per-variable call counter.""" + population = create_mock_population() ids = np.array([1, 2, 3]) population.side_effect = lambda key, period: ids - # First call random(population) - assert population.simulation.count_random_calls == 1 + assert population.simulation.random_call_counts["test_variable:"] == 1 - # Second call random(population) - assert population.simulation.count_random_calls == 2 + assert population.simulation.random_call_counts["test_variable:"] == 2 def test_random_handles_negative_ids(self): """Test that random() handles negative IDs properly.""" - population = Mock() - population.simulation = Mock() - population.simulation.count_random_calls = 0 - population.entity = Mock() - population.entity.key = "person" - - holder = Mock() - holder.get_known_periods.return_value = [] - population.simulation.get_holder.return_value = holder - population.simulation.default_calculation_period = Mock() - - # Include negative IDs + population = create_mock_population() ids = np.array([-100, -1, 0, 1, 100]) population.side_effect = lambda key, period: ids - # Should handle negative IDs without errors result = random(population) assert isinstance(result, np.ndarray) assert len(result) == len(ids) assert all(0 <= val <= 1 for val in result) - def test_no_negative_seed_error_with_overflow(self): - """Test that seed calculation overflow doesn't cause negative seed error.""" - population = Mock() - population.simulation = Mock() - population.simulation.count_random_calls = 999999999 # Large count - population.entity = Mock() - population.entity.key = "person" - - holder = Mock() - holder.get_known_periods.return_value = [] - population.simulation.get_holder.return_value = holder - population.simulation.default_calculation_period = Mock() - - # Use the exact ID that would cause overflow in old implementation - # This ID when multiplied by 100 and added to count_random_calls - # would overflow int64 and become negative - overflow_id = np.array([np.iinfo(np.int64).max // 100]) - population.side_effect = lambda key, period: overflow_id - - # In the old implementation, this would raise: - # ValueError: Seed must be between 0 and 2**32 - 1 - # With the fix using abs(), it should work fine - result = random(population) + def test_random_order_independence(self): + """Test that variable order doesn't affect random values.""" + ids = np.array([1, 2, 3]) + + # Simulate calling variable_a first, then variable_b + pop_a = create_mock_population(variable_name="variable_a") + pop_a.side_effect = lambda key, period: ids + result_a_first = random(pop_a) + + pop_b = create_mock_population(variable_name="variable_b") + pop_b.side_effect = lambda key, period: ids + result_b_after_a = random(pop_b) + + # Now simulate calling variable_b first, then variable_a + pop_b2 = create_mock_population(variable_name="variable_b") + pop_b2.side_effect = lambda key, period: ids + result_b_first = random(pop_b2) + + pop_a2 = create_mock_population(variable_name="variable_a") + pop_a2.side_effect = lambda key, period: ids + result_a_after_b = random(pop_a2) + + # Same variable should produce same results regardless of order + np.testing.assert_array_equal(result_a_first, result_a_after_b) + np.testing.assert_array_equal(result_b_after_a, result_b_first) + + def test_random_multiple_calls_same_formula(self): + """Test that multiple calls within same formula produce different values.""" + population = create_mock_population() + ids = np.array([1, 2, 3]) + population.side_effect = lambda key, period: ids + + result1 = random(population) + result2 = random(population) + result3 = random(population) + + # Each call should produce different values + assert not np.array_equal(result1, result2) + assert not np.array_equal(result2, result3) + assert not np.array_equal(result1, result3) + + def test_random_with_salt(self): + """Test that salt parameter creates distinct random streams.""" + population = create_mock_population() + ids = np.array([1, 2, 3]) + population.side_effect = lambda key, period: ids + + result_no_salt = random(population) + + # Reset counter + population.simulation.random_call_counts = {} + + result_with_salt = random(population, salt="custom_salt") + + # Different salts should produce different values + assert not np.array_equal(result_no_salt, result_with_salt) + + def test_random_outside_formula_raises(self): + """Test that calling random() outside formula context without salt raises.""" + population = create_mock_population() + population.simulation.tracer.stack = [] # Empty stack + ids = np.array([1, 2, 3]) + population.side_effect = lambda key, period: ids + + with pytest.raises(ValueError, match="random\\(\\) called outside formula context"): + random(population) + + def test_random_outside_formula_with_salt(self): + """Test that salt allows random() to work outside formula context.""" + population = create_mock_population() + population.simulation.tracer.stack = [] # Empty stack + ids = np.array([1, 2, 3]) + population.side_effect = lambda key, period: ids + + result = random(population, salt="explicit_salt") assert isinstance(result, np.ndarray) - assert len(result) == 1 - assert 0 <= result[0] <= 1 + assert len(result) == len(ids) + assert all(0 <= val <= 1 for val in result) + + def test_different_variables_produce_different_values(self): + """Test that different variable names produce different random values.""" + ids = np.array([1, 2, 3]) + + pop_a = create_mock_population(variable_name="variable_a") + pop_a.side_effect = lambda key, period: ids + result_a = random(pop_a) + + pop_b = create_mock_population(variable_name="variable_b") + pop_b.side_effect = lambda key, period: ids + result_b = random(pop_b) + + # Different variables should produce different values + assert not np.array_equal(result_a, result_b) + + def test_same_entity_same_variable_same_value(self): + """Test reproducibility: same entity + variable = same random value.""" + ids = np.array([42]) + + pop1 = create_mock_population(variable_name="my_variable") + pop1.side_effect = lambda key, period: ids + result1 = random(pop1) + + pop2 = create_mock_population(variable_name="my_variable") + pop2.side_effect = lambda key, period: ids + result2 = random(pop2) + + np.testing.assert_array_equal(result1, result2) From 41f175533ce7823495f4c8ac4bc2236a3c928f8f Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Wed, 4 Feb 2026 14:47:39 -0500 Subject: [PATCH 2/4] Add changelog entry for random() name-based salting Co-Authored-By: Claude Opus 4.5 --- changelog_entry.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..c356b2f3 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: major + changes: + changed: + - Replace global execution counter in random() with variable-name-based salting for order-independent reproducibility. This is a breaking change that will produce different random values for existing simulations. From 17bbd31ef11b6b5e65ab22bdc7ec3be2012b03c3 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Wed, 4 Feb 2026 14:49:29 -0500 Subject: [PATCH 3/4] Apply black formatting Co-Authored-By: Claude Opus 4.5 --- policyengine_core/commons/formulas.py | 8 ++++++-- policyengine_core/populations/population.py | 8 ++++++-- tests/core/commons/test_random_seed.py | 16 ++++++++++------ 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/policyengine_core/commons/formulas.py b/policyengine_core/commons/formulas.py index 12509c44..9b5443bb 100644 --- a/policyengine_core/commons/formulas.py +++ b/policyengine_core/commons/formulas.py @@ -314,7 +314,9 @@ def _stable_string_hash(s: str) -> np.uint64: import warnings with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "overflow encountered", RuntimeWarning) + warnings.filterwarnings( + "ignore", "overflow encountered", RuntimeWarning + ) h = np.uint64(0) for byte in s.encode("utf-8"): h = h * np.uint64(31) + np.uint64(byte) @@ -382,7 +384,9 @@ def random(population, salt: str = None): entity_ids = population(f"{population.entity.key}_id", period) # Create deterministic seed from variable name + call count - base_seed = _stable_string_hash(f"{variable_name}:{call_count}:{salt or ''}") + base_seed = _stable_string_hash( + f"{variable_name}:{call_count}:{salt or ''}" + ) # Combine with entity IDs via XOR seeds = (entity_ids.astype(np.uint64) ^ base_seed).astype(np.uint64) diff --git a/policyengine_core/populations/population.py b/policyengine_core/populations/population.py index 988485ec..c2d9d0f4 100644 --- a/policyengine_core/populations/population.py +++ b/policyengine_core/populations/population.py @@ -78,13 +78,17 @@ def check_period_validity( if period is None: stack = traceback.extract_stack() filename, line_number, function_name, line_of_code = stack[-3] - raise ValueError(""" + raise ValueError( + """ You requested computation of variable "{}", but you did not specify on which period in "{}:{}": {} When you request the computation of a variable within a formula, you must always specify the period as the second parameter. The convention is to call this parameter "period". For example: computed_salary = person('salary', period). See more information at . -""".format(variable_name, filename, line_number, line_of_code)) +""".format( + variable_name, filename, line_number, line_of_code + ) + ) def __call__( self, diff --git a/tests/core/commons/test_random_seed.py b/tests/core/commons/test_random_seed.py index be57a98d..a5c7e82f 100644 --- a/tests/core/commons/test_random_seed.py +++ b/tests/core/commons/test_random_seed.py @@ -60,11 +60,13 @@ class TestRandomSeed: def test_random_with_large_entity_ids(self): """Test that random() handles large entity IDs without overflow.""" population = create_mock_population() - large_ids = np.array([ - np.iinfo(np.int64).max - 1000, - np.iinfo(np.int64).max // 2, - 1234567890123456789, - ]) + large_ids = np.array( + [ + np.iinfo(np.int64).max - 1000, + np.iinfo(np.int64).max // 2, + 1234567890123456789, + ] + ) population.side_effect = lambda key, period: large_ids result = random(population) @@ -176,7 +178,9 @@ def test_random_outside_formula_raises(self): ids = np.array([1, 2, 3]) population.side_effect = lambda key, period: ids - with pytest.raises(ValueError, match="random\\(\\) called outside formula context"): + with pytest.raises( + ValueError, match="random\\(\\) called outside formula context" + ): random(population) def test_random_outside_formula_with_salt(self): From 0fb48228f9b5ba7700b924c8e4ff6240b3834e8b Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Wed, 4 Feb 2026 14:52:04 -0500 Subject: [PATCH 4/4] Revert unrelated population.py formatting change The local black version (25.12.0) formats differently than CI's black. Reverting this unrelated file to avoid CI lint failure. Co-Authored-By: Claude Opus 4.5 --- policyengine_core/populations/population.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/policyengine_core/populations/population.py b/policyengine_core/populations/population.py index c2d9d0f4..988485ec 100644 --- a/policyengine_core/populations/population.py +++ b/policyengine_core/populations/population.py @@ -78,17 +78,13 @@ def check_period_validity( if period is None: stack = traceback.extract_stack() filename, line_number, function_name, line_of_code = stack[-3] - raise ValueError( - """ + raise ValueError(""" You requested computation of variable "{}", but you did not specify on which period in "{}:{}": {} When you request the computation of a variable within a formula, you must always specify the period as the second parameter. The convention is to call this parameter "period". For example: computed_salary = person('salary', period). See more information at . -""".format( - variable_name, filename, line_number, line_of_code - ) - ) +""".format(variable_name, filename, line_number, line_of_code)) def __call__( self,