diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..c356b2f3 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: major + changes: + changed: + - Replace global execution counter in random() with variable-name-based salting for order-independent reproducibility. This is a breaking change that will produce different random values for existing simulations. diff --git a/policyengine_core/commons/formulas.py b/policyengine_core/commons/formulas.py index 2bd1c494..9b5443bb 100644 --- a/policyengine_core/commons/formulas.py +++ b/policyengine_core/commons/formulas.py @@ -305,38 +305,91 @@ def amount_between( return clip(amount, threshold_1, threshold_2) - threshold_1 -def random(population): +def _stable_string_hash(s: str) -> np.uint64: + """Deterministic hash consistent across Python processes. + + Python's built-in hash() is not deterministic across processes (since 3.3), + so we use a polynomial rolling hash with additional mixing. + """ + import warnings + + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", "overflow encountered", RuntimeWarning + ) + h = np.uint64(0) + for byte in s.encode("utf-8"): + h = h * np.uint64(31) + np.uint64(byte) + h = h ^ (h >> np.uint64(33)) + h = h * np.uint64(0xFF51AFD7ED558CCD) + h = h ^ (h >> np.uint64(33)) + return h + + +def random(population, salt: str = None): """ Generate random values for each entity in the population. + Uses the current variable name (from the tracer stack) to create + deterministic random values that are independent of variable execution + order. This prevents the "ripple effect" where adding or reordering + variables would change random values for all subsequent variables. + Args: population: The population object containing simulation data. + salt: Optional salt string to differentiate random streams when + called outside a formula context or to create distinct + streams within the same formula. Returns: - np.ndarray: Array of random values for each entity. + np.ndarray: Array of random values in [0, 1) for each entity. + + Raises: + ValueError: If called outside a formula context without a salt. """ - # Initialize count of random calls if not already present - if not hasattr(population.simulation, "count_random_calls"): - population.simulation.count_random_calls = 0 - population.simulation.count_random_calls += 1 + simulation = population.simulation + + # Get variable name from tracer stack + if simulation.tracer.stack: + variable_name = simulation.tracer.stack[-1]["name"] + elif salt is not None: + variable_name = "__no_variable__" + else: + raise ValueError( + "random() called outside formula context without a salt. " + "Provide salt parameter: random(population, salt='unique_name')" + ) + + # Per-variable counter (replaces global counter) + if not hasattr(simulation, "random_call_counts"): + simulation.random_call_counts = {} + + counter_key = f"{variable_name}:{salt or ''}" + simulation.random_call_counts[counter_key] = ( + simulation.random_call_counts.get(counter_key, 0) + 1 + ) + call_count = simulation.random_call_counts[counter_key] # Get known periods or use default calculation period - known_periods = population.simulation.get_holder( + known_periods = simulation.get_holder( f"{population.entity.key}_id" ).get_known_periods() period = ( known_periods[0] if known_periods - else population.simulation.default_calculation_period + else simulation.default_calculation_period ) # Get entity IDs for the period entity_ids = population(f"{population.entity.key}_id", period) - # Generate deterministic random values using vectorised hash - seeds = np.abs( - entity_ids * 100 + population.simulation.count_random_calls - ).astype(np.uint64) + # Create deterministic seed from variable name + call count + base_seed = _stable_string_hash( + f"{variable_name}:{call_count}:{salt or ''}" + ) + + # Combine with entity IDs via XOR + seeds = (entity_ids.astype(np.uint64) ^ base_seed).astype(np.uint64) # PCG-style mixing function for high-quality pseudo-random generation x = seeds * np.uint64(0x5851F42D4C957F2D) diff --git a/tests/core/commons/test_random_seed.py b/tests/core/commons/test_random_seed.py index e63abe3f..a5c7e82f 100644 --- a/tests/core/commons/test_random_seed.py +++ b/tests/core/commons/test_random_seed.py @@ -1,152 +1,226 @@ -"""Test the random function with large entity IDs to ensure no overflow.""" +"""Test the random function with name-based salting.""" import numpy as np import pytest from unittest.mock import Mock -from policyengine_core.commons.formulas import random +from policyengine_core.commons.formulas import random, _stable_string_hash + + +def create_mock_population(entity_key="person", variable_name="test_variable"): + """Helper to create a mock population with tracer stack.""" + population = Mock() + + # Create simulation as a simple object to avoid Mock's hasattr behavior + class MockSimulation: + pass + + simulation = MockSimulation() + simulation.tracer = Mock() + simulation.tracer.stack = [ + {"name": variable_name, "period": "2024", "branch_name": "default"} + ] + + holder = Mock() + holder.get_known_periods.return_value = [] + simulation.get_holder = Mock(return_value=holder) + simulation.default_calculation_period = Mock() + + population.simulation = simulation + population.entity = Mock() + population.entity.key = entity_key + + return population + + +class TestStableStringHash: + """Test the stable string hash function.""" + + def test_hash_consistency(self): + """Test that hash produces consistent results.""" + s = "test_variable:1:" + h1 = _stable_string_hash(s) + h2 = _stable_string_hash(s) + assert h1 == h2 + + def test_hash_different_inputs(self): + """Test that different inputs produce different hashes.""" + h1 = _stable_string_hash("variable_a:1:") + h2 = _stable_string_hash("variable_b:1:") + assert h1 != h2 + + def test_hash_returns_uint64(self): + """Test that hash returns numpy uint64.""" + h = _stable_string_hash("test") + assert isinstance(h, np.uint64) class TestRandomSeed: - """Test random seed handling to prevent NumPy overflow errors.""" + """Test random seed handling with name-based salting.""" def test_random_with_large_entity_ids(self): """Test that random() handles large entity IDs without overflow.""" - # Create a mock population with simulation - population = Mock() - population.simulation = Mock() - population.simulation.count_random_calls = 0 - population.entity = Mock() - population.entity.key = "person" - - # Mock the get_holder and get_known_periods - holder = Mock() - holder.get_known_periods.return_value = [] - population.simulation.get_holder.return_value = holder - population.simulation.default_calculation_period = Mock() - - # Test with very large entity IDs that would cause overflow - # if not handled properly + population = create_mock_population() large_ids = np.array( [ - np.iinfo(np.int64).max - 1000, # Very large positive ID - np.iinfo(np.int64).max // 2, # Large positive ID - 1234567890123456789, # Another large ID + np.iinfo(np.int64).max - 1000, + np.iinfo(np.int64).max // 2, + 1234567890123456789, ] ) - - # Mock the population call to return large IDs population.side_effect = lambda key, period: large_ids - # This should not raise a ValueError about negative seeds result = random(population) - # Check that we got valid random values assert isinstance(result, np.ndarray) assert len(result) == len(large_ids) assert all(0 <= val <= 1 for val in result) def test_random_seed_consistency(self): """Test that random() produces consistent results for same inputs.""" - # Create mock population - population = Mock() - population.simulation = Mock() - population.simulation.count_random_calls = 0 - population.entity = Mock() - population.entity.key = "household" - - holder = Mock() - holder.get_known_periods.return_value = [] - population.simulation.get_holder.return_value = holder - population.simulation.default_calculation_period = Mock() - - # Use same IDs + population = create_mock_population(entity_key="household") ids = np.array([1, 2, 3]) population.side_effect = lambda key, period: ids - # First call result1 = random(population) - # Reset count to simulate same conditions - population.simulation.count_random_calls = 0 + # Reset call counts to simulate same conditions + population.simulation.random_call_counts = {} - # Second call with same conditions result2 = random(population) - # Results should be identical np.testing.assert_array_equal(result1, result2) - def test_random_increments_call_count(self): - """Test that random() increments the call counter.""" - population = Mock() - population.simulation = Mock() - population.simulation.count_random_calls = 0 - population.entity = Mock() - population.entity.key = "person" - - holder = Mock() - holder.get_known_periods.return_value = [] - population.simulation.get_holder.return_value = holder - population.simulation.default_calculation_period = Mock() - + def test_random_increments_per_variable_counter(self): + """Test that random() increments the per-variable call counter.""" + population = create_mock_population() ids = np.array([1, 2, 3]) population.side_effect = lambda key, period: ids - # First call random(population) - assert population.simulation.count_random_calls == 1 + assert population.simulation.random_call_counts["test_variable:"] == 1 - # Second call random(population) - assert population.simulation.count_random_calls == 2 + assert population.simulation.random_call_counts["test_variable:"] == 2 def test_random_handles_negative_ids(self): """Test that random() handles negative IDs properly.""" - population = Mock() - population.simulation = Mock() - population.simulation.count_random_calls = 0 - population.entity = Mock() - population.entity.key = "person" - - holder = Mock() - holder.get_known_periods.return_value = [] - population.simulation.get_holder.return_value = holder - population.simulation.default_calculation_period = Mock() - - # Include negative IDs + population = create_mock_population() ids = np.array([-100, -1, 0, 1, 100]) population.side_effect = lambda key, period: ids - # Should handle negative IDs without errors result = random(population) assert isinstance(result, np.ndarray) assert len(result) == len(ids) assert all(0 <= val <= 1 for val in result) - def test_no_negative_seed_error_with_overflow(self): - """Test that seed calculation overflow doesn't cause negative seed error.""" - population = Mock() - population.simulation = Mock() - population.simulation.count_random_calls = 999999999 # Large count - population.entity = Mock() - population.entity.key = "person" - - holder = Mock() - holder.get_known_periods.return_value = [] - population.simulation.get_holder.return_value = holder - population.simulation.default_calculation_period = Mock() - - # Use the exact ID that would cause overflow in old implementation - # This ID when multiplied by 100 and added to count_random_calls - # would overflow int64 and become negative - overflow_id = np.array([np.iinfo(np.int64).max // 100]) - population.side_effect = lambda key, period: overflow_id - - # In the old implementation, this would raise: - # ValueError: Seed must be between 0 and 2**32 - 1 - # With the fix using abs(), it should work fine - result = random(population) + def test_random_order_independence(self): + """Test that variable order doesn't affect random values.""" + ids = np.array([1, 2, 3]) + + # Simulate calling variable_a first, then variable_b + pop_a = create_mock_population(variable_name="variable_a") + pop_a.side_effect = lambda key, period: ids + result_a_first = random(pop_a) + + pop_b = create_mock_population(variable_name="variable_b") + pop_b.side_effect = lambda key, period: ids + result_b_after_a = random(pop_b) + + # Now simulate calling variable_b first, then variable_a + pop_b2 = create_mock_population(variable_name="variable_b") + pop_b2.side_effect = lambda key, period: ids + result_b_first = random(pop_b2) + + pop_a2 = create_mock_population(variable_name="variable_a") + pop_a2.side_effect = lambda key, period: ids + result_a_after_b = random(pop_a2) + + # Same variable should produce same results regardless of order + np.testing.assert_array_equal(result_a_first, result_a_after_b) + np.testing.assert_array_equal(result_b_after_a, result_b_first) + + def test_random_multiple_calls_same_formula(self): + """Test that multiple calls within same formula produce different values.""" + population = create_mock_population() + ids = np.array([1, 2, 3]) + population.side_effect = lambda key, period: ids + + result1 = random(population) + result2 = random(population) + result3 = random(population) + + # Each call should produce different values + assert not np.array_equal(result1, result2) + assert not np.array_equal(result2, result3) + assert not np.array_equal(result1, result3) + + def test_random_with_salt(self): + """Test that salt parameter creates distinct random streams.""" + population = create_mock_population() + ids = np.array([1, 2, 3]) + population.side_effect = lambda key, period: ids + + result_no_salt = random(population) + + # Reset counter + population.simulation.random_call_counts = {} + + result_with_salt = random(population, salt="custom_salt") + + # Different salts should produce different values + assert not np.array_equal(result_no_salt, result_with_salt) + + def test_random_outside_formula_raises(self): + """Test that calling random() outside formula context without salt raises.""" + population = create_mock_population() + population.simulation.tracer.stack = [] # Empty stack + ids = np.array([1, 2, 3]) + population.side_effect = lambda key, period: ids + + with pytest.raises( + ValueError, match="random\\(\\) called outside formula context" + ): + random(population) + + def test_random_outside_formula_with_salt(self): + """Test that salt allows random() to work outside formula context.""" + population = create_mock_population() + population.simulation.tracer.stack = [] # Empty stack + ids = np.array([1, 2, 3]) + population.side_effect = lambda key, period: ids + + result = random(population, salt="explicit_salt") assert isinstance(result, np.ndarray) - assert len(result) == 1 - assert 0 <= result[0] <= 1 + assert len(result) == len(ids) + assert all(0 <= val <= 1 for val in result) + + def test_different_variables_produce_different_values(self): + """Test that different variable names produce different random values.""" + ids = np.array([1, 2, 3]) + + pop_a = create_mock_population(variable_name="variable_a") + pop_a.side_effect = lambda key, period: ids + result_a = random(pop_a) + + pop_b = create_mock_population(variable_name="variable_b") + pop_b.side_effect = lambda key, period: ids + result_b = random(pop_b) + + # Different variables should produce different values + assert not np.array_equal(result_a, result_b) + + def test_same_entity_same_variable_same_value(self): + """Test reproducibility: same entity + variable = same random value.""" + ids = np.array([42]) + + pop1 = create_mock_population(variable_name="my_variable") + pop1.side_effect = lambda key, period: ids + result1 = random(pop1) + + pop2 = create_mock_population(variable_name="my_variable") + pop2.side_effect = lambda key, period: ids + result2 = random(pop2) + + np.testing.assert_array_equal(result1, result2)