
Commit 8654e45

Refactor FailureManager to optimize Monte Carlo analysis
1 parent 02cafe8 commit 8654e45

File tree

4 files changed: +122 -53 lines changed


docs/reference/api-full.md

Lines changed: 21 additions & 6 deletions
@@ -12,7 +12,7 @@ Quick links:
 - [CLI Reference](cli.md)
 - [DSL Reference](dsl.md)

-Generated from source code on: August 09, 2025 at 13:04 UTC
+Generated from source code on: August 09, 2025 at 22:20 UTC

 Modules auto-discovered: 63

@@ -1982,7 +1982,7 @@ Attributes:
 - `compute_exclusions(self, policy: "'FailurePolicy | None'" = None, seed_offset: 'int | None' = None) -> 'tuple[set[str], set[str]]'` - Compute set of nodes and links to exclude for a failure iteration.
 - `create_network_view(self, excluded_nodes: 'set[str] | None' = None, excluded_links: 'set[str] | None' = None) -> 'NetworkView'` - Create NetworkView with specified exclusions.
 - `get_failure_policy(self) -> "'FailurePolicy | None'"` - Get failure policy for analysis.
-- `run_demand_placement_monte_carlo(self, demands_config: 'list[dict[str, Any]] | Any', iterations: 'int' = 100, parallelism: 'int' = 1, placement_rounds: 'int | str' = 'auto', baseline: 'bool' = False, seed: 'int | None' = None, store_failure_patterns: 'bool' = False, **kwargs) -> 'Any'` - Analyze traffic demand placement success under failures.
+- `run_demand_placement_monte_carlo(self, demands_config: 'list[dict[str, Any]] | Any', iterations: 'int' = 100, parallelism: 'int' = 1, placement_rounds: 'int | str' = 'auto', baseline: 'bool' = False, seed: 'int | None' = None, store_failure_patterns: 'bool' = False, include_flow_details: 'bool' = False, **kwargs) -> 'Any'` - Analyze traffic demand placement success under failures.
 - `run_max_flow_monte_carlo(self, source_path: 'str', sink_path: 'str', mode: 'str' = 'combine', iterations: 'int' = 100, parallelism: 'int' = 1, shortest_path: 'bool' = False, flow_placement: 'FlowPlacement | str' = <FlowPlacement.PROPORTIONAL: 1>, baseline: 'bool' = False, seed: 'int | None' = None, store_failure_patterns: 'bool' = False, include_flow_summary: 'bool' = False, **kwargs) -> 'Any'` - Analyze maximum flow capacity envelopes between node groups under failures.
 - `run_monte_carlo_analysis(self, analysis_func: 'AnalysisFunction', iterations: 'int' = 1, parallelism: 'int' = 1, baseline: 'bool' = False, seed: 'int | None' = None, store_failure_patterns: 'bool' = False, **analysis_kwargs) -> 'dict[str, Any]'` - Run Monte Carlo failure analysis with any analysis function.
 - `run_sensitivity_monte_carlo(self, source_path: 'str', sink_path: 'str', mode: 'str' = 'combine', iterations: 'int' = 100, parallelism: 'int' = 1, shortest_path: 'bool' = False, flow_placement: 'FlowPlacement | str' = <FlowPlacement.PROPORTIONAL: 1>, baseline: 'bool' = False, seed: 'int | None' = None, store_failure_patterns: 'bool' = False, **kwargs) -> 'Any'` - Analyze component criticality for flow capacity under failures.
@@ -2656,10 +2656,16 @@ YAML Configuration Example:
     baseline: true  # Include baseline iteration first
     seed: 42  # Optional reproducible seed
     store_failure_patterns: false  # Store failure patterns if needed
+    include_flow_details: true  # Collect per-demand cost distribution and edges

 Results stored in `scenario.results` under the step name:

-- placement_results: Per-iteration demand placement statistics (serializable)
+- placement_envelopes: Per-demand placement ratio envelopes with statistics
+
+  When ``include_flow_details`` is true, each envelope also includes
+  ``flow_summary_stats`` with aggregated ``cost_distribution_stats`` and
+  ``edge_usage_frequencies``.
+
 - failure_pattern_results: Failure pattern mapping (if requested)
 - metadata: Execution metadata (iterations, parallelism, baseline, etc.)

@@ -2676,6 +2682,9 @@ Attributes:
     baseline: Include baseline iteration without failures first.
     seed: Optional seed for reproducibility.
     store_failure_patterns: Whether to store failure pattern results.
+    include_flow_details: If True, collect per-demand cost distribution and
+        edges used per iteration, and aggregate into ``flow_summary_stats``
+        on each placement envelope.

 **Attributes:**

@@ -2688,6 +2697,7 @@ Attributes:
 - `placement_rounds` (int | str) = auto
 - `baseline` (bool) = False
 - `store_failure_patterns` (bool) = False
+- `include_flow_details` (bool) = False

 **Methods:**

@@ -3029,7 +3039,7 @@ failure analysis scenarios.
 Note: This module is distinct from ngraph.workflow.analysis, which provides
 notebook visualization components for workflow results.

-### demand_placement_analysis(network_view: "'NetworkView'", demands_config: 'list[dict[str, Any]]', placement_rounds: 'int | str' = 'auto', **kwargs) -> 'dict[str, Any]'
+### demand_placement_analysis(network_view: "'NetworkView'", demands_config: 'list[dict[str, Any]]', placement_rounds: 'int | str' = 'auto', include_flow_details: 'bool' = False, **kwargs) -> 'dict[str, Any]'

 Analyze traffic demand placement success rates.

@@ -3046,9 +3056,14 @@ Returns:
 - total_demand: Total demand volume.
 - overall_placement_ratio: total_placed / total_demand (0.0 if undefined).
 - demand_results: List of per-demand statistics preserving offered volume.
-- priority_results: Mapping from priority to aggregated statistics with keys

-  total_volume, placed_volume, unplaced_volume, placement_ratio,
+  When ``include_flow_details`` is True, each entry also includes
+  ``cost_distribution`` mapping path cost to placed volume and
+  ``edges_used`` as a list of edge identifiers seen in the placed flows.
+
+- priority_results: Mapping from priority to aggregated statistics with
+
+  keys total_volume, placed_volume, unplaced_volume, placement_ratio,
   and demand_count.

 ### max_flow_analysis(network_view: "'NetworkView'", source_regex: 'str', sink_regex: 'str', mode: 'str' = 'combine', shortest_path: 'bool' = False, flow_placement: 'FlowPlacement' = <FlowPlacement.PROPORTIONAL: 1>, include_flow_summary: 'bool' = False, **kwargs) -> 'list[tuple]'
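
Taken together, the doc changes above describe the new `include_flow_details` flag end to end: it is accepted by `run_demand_placement_monte_carlo`, surfaced in the workflow YAML, and reflected in the per-envelope `flow_summary_stats`. A minimal usage sketch in Python; only the parameter names come from the signature above, while the demands list and the surrounding FailureManager setup are assumptions:

# Sketch: `fm` is an existing FailureManager instance (construction omitted);
# the demand dict fields below are illustrative, not taken from this commit.
demands_config = [
    {"source_path": "^A$", "sink_path": "^B$", "demand": 10.0},
]

results = fm.run_demand_placement_monte_carlo(
    demands_config=demands_config,   # list[dict[str, Any]] per the signature
    iterations=100,
    parallelism=4,
    baseline=True,
    seed=42,
    include_flow_details=True,       # new flag added in this commit
)

# Per the doc text above, each placement envelope now carries
# "flow_summary_stats" with aggregated "cost_distribution_stats" and
# "edge_usage_frequencies"; without the flag those keys are absent.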

ngraph/failure/manager/manager.py

Lines changed: 82 additions & 33 deletions
@@ -502,7 +502,11 @@ def run_monte_carlo_analysis(
         logger.debug("Pre-computing failure exclusions for all iterations")
         pre_compute_start = time.time()

-        worker_args = []
+        worker_args: list[tuple] = []
+        iteration_index_to_key: dict[int, tuple] = {}
+        key_to_first_arg: dict[tuple, tuple] = {}
+        key_to_members: dict[tuple, list[int]] = {}
+
         for i in range(mc_iters):
             seed_offset = None
             if seed is not None:
@@ -520,37 +524,87 @@ def run_monte_carlo_analysis(
                 policy, seed_offset
             )

-            # Create worker arguments
-            worker_args.append(
-                (
-                    excluded_nodes,
-                    excluded_links,
-                    analysis_func,
-                    analysis_kwargs,
-                    i,  # iteration_index
-                    is_baseline,
-                    func_name,
-                )
+            arg = (
+                excluded_nodes,
+                excluded_links,
+                analysis_func,
+                analysis_kwargs,
+                i,  # iteration_index
+                is_baseline,
+                func_name,
+            )
+            worker_args.append(arg)
+
+            # Build deduplication key (excludes iteration index)
+            dedup_key = _create_cache_key(
+                excluded_nodes, excluded_links, func_name, analysis_kwargs
             )
+            iteration_index_to_key[i] = dedup_key
+            if dedup_key not in key_to_first_arg:
+                key_to_first_arg[dedup_key] = arg
+            key_to_members.setdefault(dedup_key, []).append(i)

         pre_compute_time = time.time() - pre_compute_start
         logger.debug(
             f"Pre-computed {len(worker_args)} exclusion sets in {pre_compute_time:.2f}s"
         )

+        # Prepare unique tasks (deduplicated by failure pattern + analysis params)
+        unique_worker_args: list[tuple] = list(key_to_first_arg.values())
+        num_unique_tasks: int = len(unique_worker_args)
+        logger.info(
+            f"Monte-Carlo deduplication: {num_unique_tasks} unique patterns from {mc_iters} iterations"
+        )
+
         # Determine if we should run in parallel
-        use_parallel = parallelism > 1 and mc_iters > 1
+        use_parallel = parallelism > 1 and num_unique_tasks > 1

         start_time = time.time()

+        # Execute only unique tasks, then replicate results to original indices
         if use_parallel:
-            results, failure_patterns = self._run_parallel(
-                worker_args, mc_iters, store_failure_patterns, parallelism
+            unique_result_values, _ = self._run_parallel(
+                unique_worker_args, num_unique_tasks, False, parallelism
             )
         else:
-            results, failure_patterns = self._run_serial(
-                worker_args, store_failure_patterns
+            unique_result_values, _ = self._run_serial(unique_worker_args, False)
+
+        # Map unique task results back to their groups by zipping args with results
+        key_to_result: dict[tuple, Any] = {}
+        for arg, value in zip(unique_worker_args, unique_result_values, strict=False):
+            exc_nodes, exc_links = arg[0], arg[1]
+            dedup_key = _create_cache_key(
+                exc_nodes, exc_links, func_name, analysis_kwargs
             )
+            key_to_result[dedup_key] = value
+
+        # Build full results list in original order
+        results: list[Any] = [None] * mc_iters  # type: ignore[var-annotated]
+        for key, members in key_to_members.items():
+            if key not in key_to_result:
+                # Defensive: should not happen unless parallel map returned fewer tasks
+                continue
+            value = key_to_result[key]
+            for idx in members:
+                results[idx] = value
+
+        # Reconstruct failure patterns per original iteration if requested
+        failure_patterns: list[dict[str, Any]] = []
+        if store_failure_patterns:
+            for key, members in key_to_members.items():
+                # Use exclusions from the representative arg
+                rep_arg = key_to_first_arg[key]
+                exc_nodes: set[str] = rep_arg[0]
+                exc_links: set[str] = rep_arg[1]
+                for idx in members:
+                    failure_patterns.append(
+                        {
+                            "iteration_index": idx,
+                            "is_baseline": bool(baseline and idx == 0),
+                            "excluded_nodes": list(exc_nodes),
+                            "excluded_links": list(exc_links),
+                        }
+                    )

         elapsed_time = time.time() - start_time

@@ -564,19 +618,14 @@ def run_monte_carlo_analysis(
                 "analysis_function": func_name,
                 "policy_name": self.policy_name,
                 "execution_time": elapsed_time,
-                "unique_patterns": len(
-                    set(
-                        (tuple(sorted(args[0])), tuple(sorted(args[1])))
-                        for args in worker_args
-                    )
-                ),
+                "unique_patterns": num_unique_tasks,
             },
         }

     def _run_parallel(
         self,
         worker_args: list[tuple],
-        mc_iters: int,
+        total_tasks: int,
         store_failure_patterns: bool,
         parallelism: int,
     ) -> tuple[list[Any], list[dict[str, Any]]]:
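
The two hunks above carry the heart of the refactor: iterations that share the same exclusion sets and analysis parameters are grouped under one deduplication key, only one representative task per key is executed, its result is copied back to every member iteration, and the metadata's unique_patterns count falls out of the same bookkeeping. A self-contained sketch of that group / run-once / fan-out pattern, with toy types standing in for the real worker plumbing:

from collections import defaultdict
from collections.abc import Hashable
from typing import Any, Callable

def run_deduplicated(tasks: list[Hashable], run: Callable[[Hashable], Any]) -> list[Any]:
    """Run each distinct task once and copy its result to every duplicate slot."""
    key_to_members: dict[Hashable, list[int]] = defaultdict(list)
    for i, task in enumerate(tasks):
        key_to_members[task].append(i)

    results: list[Any] = [None] * len(tasks)
    for task, members in key_to_members.items():
        value = run(task)           # executed once per unique task
        for idx in members:
            results[idx] = value    # replicated back to the original iteration order
    return results

# Toy usage: six "iterations" collapse to three unique exclusion patterns.
patterns = [("n1",), ("n2",), ("n1",), (), ("n2",), ("n1",)]
print(run_deduplicated(patterns, run=lambda excluded: f"analyzed {excluded}"))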
@@ -596,17 +645,17 @@ def _run_parallel(
         Returns:
             Tuple of (results_list, failure_patterns_list).
         """
-        workers = min(parallelism, mc_iters)
+        workers = min(parallelism, total_tasks)
         logger.info(
-            f"Running parallel analysis with {workers} workers for {mc_iters} iterations"
+            f"Running parallel analysis with {workers} workers for {total_tasks} iterations"
         )

         # Serialize network once for all workers
         network_pickle = pickle.dumps(self.network)
         logger.debug(f"Serialized network once: {len(network_pickle)} bytes")

         # Calculate optimal chunksize to minimize IPC overhead
-        chunksize = max(1, mc_iters // (workers * 4))
+        chunksize = max(1, total_tasks // (workers * 4))
         logger.debug(f"Using chunksize={chunksize} for parallel execution")

         start_time = time.time()
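
One knock-on effect of passing total_tasks (now the number of unique tasks) into _run_parallel is that the chunksize heuristic scales with real work rather than raw iteration count. A quick check of the formula with illustrative numbers:

def chunksize(total_tasks: int, parallelism: int) -> int:
    # Same heuristic as the line changed above: aim for roughly four chunks
    # per worker, never below one task per chunk.
    workers = min(parallelism, total_tasks)
    return max(1, total_tasks // (workers * 4))

print(chunksize(total_tasks=1000, parallelism=8))  # 31 -> large batches, little IPC overhead
print(chunksize(total_tasks=12, parallelism=8))    # 1  -> few unique tasks stay fine-grained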
@@ -622,7 +671,7 @@ def _run_parallel(
             logger.debug(
                 f"ProcessPoolExecutor created with {workers} workers and shared network"
             )
-            logger.info(f"Starting parallel execution of {mc_iters} iterations")
+            logger.info(f"Starting parallel execution of {total_tasks} iterations")

             try:
                 for (
@@ -649,9 +698,9 @@ def _run_parallel(
                     )

                     # Progress logging
-                    if completed_tasks % max(1, mc_iters // 10) == 0:
+                    if completed_tasks % max(1, total_tasks // 10) == 0:
                         logger.info(
-                            f"Parallel analysis progress: {completed_tasks}/{mc_iters} tasks completed"
+                            f"Parallel analysis progress: {completed_tasks}/{total_tasks} tasks completed"
                         )

             except Exception as e:
@@ -664,7 +713,7 @@
         elapsed_time = time.time() - start_time
         logger.info(f"Parallel analysis completed in {elapsed_time:.2f} seconds")
         logger.debug(
-            f"Average time per iteration: {elapsed_time / mc_iters:.3f} seconds"
+            f"Average time per iteration: {elapsed_time / total_tasks:.3f} seconds"
         )

         # Log exclusion pattern diversity for cache efficiency analysis
@@ -678,9 +727,9 @@
             unique_exclusions.add(exclusion_key)

         logger.info(
-            f"Generated {len(unique_exclusions)} unique exclusion patterns from {mc_iters} iterations"
+            f"Generated {len(unique_exclusions)} unique exclusion patterns from {total_tasks} iterations"
         )
-        cache_efficiency = (mc_iters - len(unique_exclusions)) / mc_iters * 100
+        cache_efficiency = (total_tasks - len(unique_exclusions)) / total_tasks * 100
         logger.debug(
             f"Potential cache efficiency: {cache_efficiency:.1f}% (worker processes benefit from caching)"
         )

scenarios/square_mesh.yaml

Lines changed: 2 additions & 2 deletions
@@ -60,8 +60,8 @@ workflow:
   # Single pairwise analysis generates complete 4x4 node-to-node capacity matrix
   - step_type: CapacityEnvelopeAnalysis
     name: "node_to_node_capacity_matrix"
-    source_path: "^N([1-4])$"  # Capturing group creates separate groups: "1", "2", "3", "4"
-    sink_path: "^N([1-4])$"  # Capturing group creates separate groups: "1", "2", "3", "4"
+    source_path: "^(N[1-4])$"  # Capturing group creates separate groups: "1", "2", "3", "4"
+    sink_path: "^(N[1-4])$"  # Capturing group creates separate groups: "1", "2", "3", "4"
     mode: "pairwise"  # Generates all source-sink combinations (16 total)
     failure_policy: "single_link_failure"
     iterations: 10  # Monte Carlo iterations per node pair
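
One nuance the unchanged inline comments gloss over: in Python's `re`, moving the parentheses changes what the capturing group yields. The old pattern captured only the digit, while the new one captures the full node name, which presumably is how the pairwise groups end up labeled. A quick check:

import re

# Old pattern: the capture group holds just the digit.
assert re.match(r"^N([1-4])$", "N3").group(1) == "3"

# New pattern: the capture group holds the whole node name, so group labels
# derived from it become "N1".."N4" rather than "1".."4".
assert re.match(r"^(N[1-4])$", "N3").group(1) == "N3"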

tests/failure/test_manager.py

Lines changed: 17 additions & 12 deletions
@@ -279,20 +279,15 @@ def test_parallel_execution(
         mock_pool_executor: MagicMock,
         failure_manager: FailureManager,
     ) -> None:
+        """When deduplication collapses iterations to one unique pattern, execution
+        may run serially even if parallelism > 1. Validate results shape and metadata
+        without asserting executor usage.
+        """
         mock_pickle.dumps.return_value = b"fake_network_data"

         mock_pool = MagicMock()
         mock_pool_executor.return_value.__enter__.return_value = mock_pool

-        mock_results = [
-            [("src1", "dst1", 100.0)],
-            [("src2", "dst2", 200.0)],
-        ]
-        mock_pool.map.return_value = [
-            (mock_results[0], 0, False, set(), set()),
-            (mock_results[1], 1, False, set(), set()),
-        ]
-
         result = failure_manager.run_monte_carlo_analysis(
             analysis_func=mock_analysis_func,
             iterations=2,
@@ -301,7 +296,6 @@ def test_parallel_execution(

         assert len(result["results"]) == 2
         assert result["metadata"]["parallelism"] == 2
-        mock_pool_executor.assert_called_once()


 class TestFailureManagerTopLevelMatching:
@@ -397,15 +391,26 @@ class TestFailureManagerErrorHandling:
     def test_run_monte_carlo_parallel_execution_error(
         self, failure_manager: FailureManager
     ) -> None:
+        """Force two unique patterns so the parallel path is taken, then assert
+        errors in worker execution propagate.
+        """
         with patch(
             "ngraph.failure.manager.manager.ProcessPoolExecutor"
         ) as mock_pool_executor:
             mock_pool = MagicMock()
             mock_pool_executor.return_value.__enter__.return_value = mock_pool
             mock_pool.map.side_effect = RuntimeError("Parallel execution failed")

-            with patch(
-                "ngraph.failure.manager.manager.pickle.dumps", return_value=b"fake_data"
+            with (
+                patch(
+                    "ngraph.failure.manager.manager.pickle.dumps",
+                    return_value=b"fake_data",
+                ),
+                patch.object(
+                    failure_manager,
+                    "compute_exclusions",
+                    side_effect=[({"n1"}, set()), ({"n2"}, set())],
+                ),
             ):
                 with pytest.raises(RuntimeError, match="Parallel execution failed"):
                     failure_manager.run_monte_carlo_analysis(
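
The extra patch.object above exists because, after deduplication, two iterations with identical exclusions collapse into a single unique task and the executor may never be used; feeding compute_exclusions two different return values keeps num_unique_tasks > 1 so the parallel path stays reachable. A stripped-down sketch of that mocking idiom, assuming the failure_manager and mock_analysis_func fixtures used elsewhere in this test file:

from unittest.mock import patch

# Each call yields a different (excluded_nodes, excluded_links) pair, so the
# two iterations hash to two deduplication keys instead of one.
with patch.object(
    failure_manager,
    "compute_exclusions",
    side_effect=[({"n1"}, set()), ({"n2"}, set())],
):
    failure_manager.run_monte_carlo_analysis(
        analysis_func=mock_analysis_func, iterations=2, parallelism=2
    )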
