From b25f34e52522cf473f9f192ff56b80849261620a Mon Sep 17 00:00:00 2001
From: Leonard Chan <leoch@google.com>
Date: Thu, 5 Feb 2026 00:15:58 +0000
Subject: [PATCH 1/7] Add baseline pipelined flow to H2D benchmark

---
 Ironwood/configs/host_device/host_device.yaml |  2 +-
 Ironwood/src/benchmark_host_device.py         | 92 ++++++++++++++-----
 2 files changed, 68 insertions(+), 26 deletions(-)

diff --git a/Ironwood/configs/host_device/host_device.yaml b/Ironwood/configs/host_device/host_device.yaml
index 8d572ed7..ff97df1b 100644
--- a/Ironwood/configs/host_device/host_device.yaml
+++ b/Ironwood/configs/host_device/host_device.yaml
@@ -3,8 +3,8 @@ benchmarks:
   num_runs: 20
   benchmark_sweep_params:
   - {
+      h2d_type: ["simple", "pipelined"],
       data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768],
-      h2d_type: ["simple", "pipelined"]
     }
   csv_path: "../microbenchmarks/host_device"
   trace_dir: "../microbenchmarks/host_device/trace"
diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py
index 8a36a2c7..1d72b5eb 100644
--- a/Ironwood/src/benchmark_host_device.py
+++ b/Ironwood/src/benchmark_host_device.py
@@ -5,6 +5,7 @@
 from typing import Any, Dict, Tuple, List
 
 import jax
+from jax import numpy as jnp
 import numpy as np
 from benchmark_utils import MetricsStatistics
 
@@ -33,14 +34,9 @@ def benchmark_host_device(
     host_data = np.random.normal(size=(num_elements // column, column)).astype(np.float32)
     
     # Used in pipelined flow
+    # TODO: turn into a param
     num_devices_to_perform_h2d = 1
-    tensor_size = 4 * 1024 * 1024
-    target_device = jax.devices()[:num_devices_to_perform_h2d]
-    mesh = jax.sharding.Mesh(np.array(target_device), axis_names=["x"])
-    sharding = jax.sharding.NamedSharding(mesh, jax.sharding.PartitionSpec("x"))
-    pipelined_array = None
-    if h2d_type == "pipelined":
-        pipelined_array = np.random.normal(size=(tensor_size,)).astype(np.float32)
+    target_devices = jax.devices()[:num_devices_to_perform_h2d]
 
     print(
         f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations with {h2d_type=}",
@@ -99,28 +95,74 @@ def benchmark_host_device(
                     
                     device_array.delete()
                 elif h2d_type == "pipelined":
+                    target_chunk_size_mib = 16  # Sweet spot from profiling
+                    num_devices = len(target_devices)
+
                     tensors_on_device = []
-                    if data_size_mib * 1024 * 1024 < pipelined_array.nbytes:
-                        print(f"Warning: {data_size_mib=} is smaller than pipeline unit, no data will be transferred.")
-                    t0 = time.perf_counter()
-                    # Assume data_size_mib is total across devices for now
-                    bytes_left = 1024 * 1024 * data_size_mib
-                    while bytes_left >= pipelined_array.nbytes:
-                        with jax.profiler.StepTraceAnnotation("device_put", step_num=1):
-                            x_device = jax.device_put(pipelined_array, sharding)
-                            tensors_on_device.append(x_device)
-                            bytes_left -= pipelined_array.nbytes
                     
-                    total_bytes_transferred = 0
-                    for tensor in tensors_on_device:
-                        tensor.block_until_ready()
-                        total_bytes_transferred += tensor.nbytes
-                        tensor.delete()
-                    t1 = time.perf_counter()
+                    # Calculate chunks per device
+                    data_per_dev = data_size_mib / num_devices
+                    chunks_per_dev = int(data_per_dev / target_chunk_size_mib)
+                    chunks_per_dev = max(1, chunks_per_dev)
+
+                    chunks = np.array_split(host_data, chunks_per_dev * num_devices, axis=0)
 
+                    t0 = time.perf_counter()
+                    if chunks_per_dev > 1:    
+                        # We need to map chunks to the correct device
+                        # This simple example assumes chunks are perfectly divisible and ordered
+                        # In production, use `jax.sharding` mesh logic for complex layouts
+
+                        # approach 1: simple for loop
+                        for idx, chunk in enumerate(chunks):
+                            if num_devices > 1:
+                                dev = target_devices[idx % num_devices]
+                            else:
+                                dev = target_devices[0]
+                            tensors_on_device.append(jax.device_put(chunk, dev))
+                        # Re-assemble array
+                        result = jnp.vstack(tensors_on_device)
+                        # Wait for all chunks to be transferred
+                        result.block_until_ready()
+
+                        # approach 2: generator (slightly less overhead)
+                        # def chunk_generator(num_devices, chunks_per_dev):
+                        #     for n in range(chunks_per_dev):
+                        #         for d in range(num_devices):
+                        #             # 1. Get the specific small chunk
+                        #             chunk = chunks[d*chunks_per_dev+n]
+
+                        #             # 2. Trigger an individual DMA transfer for this specific chunk
+                        #             # This is where NUMA-local memory access matters
+                        #             yield jax.device_put(chunk, target_devices[d])
+
+                        # # Re-assemble array
+                        # result = jnp.vstack(list(chunk_generator(num_devices, chunks_per_dev)))
+                        # # Wait for all chunks to be transferred
+                        # result.block_until_ready()
+                    else:
+                        print(f"Warning: {data_size_mib=} is not larger than {target_chunk_size_mib=}, falling back to standard JAX put.")
+                        # Fallback to standard JAX put for small data
+                        result = jax.device_put(host_data, target_devices[0])
+                        result.block_until_ready()
+
+                    t1 = time.perf_counter()
                     h2d_perf.append((t1 - t0) * 1000)
-                    # Implement D2H at a later time after we establish H2D
-                    d2h_perf.append(0)
+
+                    # D2H
+                    t2 = time.perf_counter()
+                    # Simple device_get
+                    # Note: device_get returns a numpy array (copy)
+                    _ = jax.device_get(result)
+
+                    t3 = time.perf_counter()
+                    if not np.allclose(result, host_data):
+                        print("pipelined result not equal to host_data")
+                    d2h_perf.append((t3 - t2) * 1000)
+
+                    for r in tensors_on_device:
+                        r.delete()
+                    del tensors_on_device
 
     return {
         "H2D_Bandwidth_ms": h2d_perf,

From bbb316347a5277bcf2af5a829b0c4b25653f236d Mon Sep 17 00:00:00 2001
From: Leonard Chan <leoch@google.com>
Date: Thu, 5 Feb 2026 02:52:30 +0000
Subject: [PATCH 2/7] Add --numactl_binding flag to host_device YAMLs

---
 Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml      | 2 +-
 Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml
index a6b8febd..1084fdf9 100644
--- a/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml
@@ -53,7 +53,7 @@ spec:
           pip install -r requirements.txt
 
           GCS_BUCKET_DIR=${GCS_PATH}
-          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} --numactl_binding
         resources:
           requests:
             google.com/tpu: 4
diff --git a/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml b/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml
index 8c027c01..83d06065 100644
--- a/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml
+++ b/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml
@@ -24,7 +24,7 @@ spec:
       cd accelerator-microbenchmarks
       pip install -r requirements.txt
 
-      bash ./Ironwood/scripts/run_host_device_benchmark.sh --config Ironwood/configs/host_device/host_device.yaml
+      bash ./Ironwood/scripts/run_host_device_benchmark.sh --config Ironwood/configs/host_device/host_device.yaml --numactl_binding
 
     resources:
       requests:

From bf5c79cf574359c52227938bcef47bc4d2392386 Mon Sep 17 00:00:00 2001
From: Leonard Chan <leoch@google.com>
Date: Thu, 5 Feb 2026 06:07:41 +0000
Subject: [PATCH 3/7] Add h2d_type column to H2D/D2H output

---
 Ironwood/src/benchmark_host_device.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py
index 1d72b5eb..d80c2819 100644
--- a/Ironwood/src/benchmark_host_device.py
+++ b/Ironwood/src/benchmark_host_device.py
@@ -184,6 +184,7 @@ def benchmark_host_device_calculate_metrics(
     }
     metadata = {k: v for k, v in params if k in metadata_keys}
     metadata["dtype"] = "float32"
+    metadata["h2d_type"] = h2d_type
     
     metrics = {}
     

From 1fd18109656e1110e7128e095bf9a42ac64a6e7a Mon Sep 17 00:00:00 2001
From: Leonard Chan <leoch@google.com>
Date: Thu, 5 Feb 2026 06:21:56 +0000
Subject: [PATCH 4/7] Revert "Add baseline pipelined flow to H2D benchmark"

This reverts commit a86475d6a50bb644617d6ee9e63427072526fc61.
---
 Ironwood/configs/host_device/host_device.yaml |   3 +-
 Ironwood/src/benchmark_host_device.py         | 126 ++++--------------
 2 files changed, 26 insertions(+), 103 deletions(-)

diff --git a/Ironwood/configs/host_device/host_device.yaml b/Ironwood/configs/host_device/host_device.yaml
index ff97df1b..0b48800c 100644
--- a/Ironwood/configs/host_device/host_device.yaml
+++ b/Ironwood/configs/host_device/host_device.yaml
@@ -3,8 +3,7 @@ benchmarks:
   num_runs: 20
   benchmark_sweep_params:
   - {
-      h2d_type: ["simple", "pipelined"],
-      data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768],
+      data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768]
     }
   csv_path: "../microbenchmarks/host_device"
   trace_dir: "../microbenchmarks/host_device/trace"
diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py
index d80c2819..0c7eacc5 100644
--- a/Ironwood/src/benchmark_host_device.py
+++ b/Ironwood/src/benchmark_host_device.py
@@ -5,7 +5,7 @@
 from typing import Any, Dict, Tuple, List
 
 import jax
-from jax import numpy as jnp
+from jax import sharding
 import numpy as np
 from benchmark_utils import MetricsStatistics
 
@@ -23,9 +23,8 @@ def benchmark_host_device(
     data_size_mib: int,
     num_runs: int = 100,
     trace_dir: str = None,
-    h2d_type: str = "simple",
 ) -> Dict[str, Any]:
-    """Benchmarks H2D/D2H transfer using device_put/device_get."""
+    """Benchmarks H2D/D2H transfer using simple device_put/device_get."""
     
     num_elements = 1024 * 1024 * data_size_mib // np.dtype(np.float32).itemsize
     
@@ -33,13 +32,8 @@ def benchmark_host_device(
     column = 128
     host_data = np.random.normal(size=(num_elements // column, column)).astype(np.float32)
     
-    # Used in pipelined flow
-    # TODO: turn into a param
-    num_devices_to_perform_h2d = 1
-    target_devices = jax.devices()[:num_devices_to_perform_h2d]
-
     print(
-        f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations with {h2d_type=}",
+        f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations",
         flush=True
     )
 
@@ -71,98 +65,29 @@ def benchmark_host_device(
             
             with step_context:
                  # H2D
-                if h2d_type == "simple":
-                    t0 = time.perf_counter()
-                    # Simple device_put
-                    device_array = jax.device_put(host_data)
-                    device_array.block_until_ready()
-                    t1 = time.perf_counter()
-                    
-                    # Verify H2D shape
-                    assert device_array.shape == host_data.shape
-
-                    h2d_perf.append((t1 - t0) * 1000)
+                t0 = time.perf_counter()
                 
-                    # D2H
-                    t2 = time.perf_counter()
-                    
-                    # Simple device_get
-                    # Note: device_get returns a numpy array (copy)
-                    _ = jax.device_get(device_array)
-                    
-                    t3 = time.perf_counter()
-                    d2h_perf.append((t3 - t2) * 1000)
-                    
-                    device_array.delete()
-                elif h2d_type == "pipelined":
-                    target_chunk_size_mib = 16  # Sweet spot from profiling
-                    num_devices = len(target_devices)
-
-                    tensors_on_device = []
-                    
-                    # Calculate chunks per device
-                    data_per_dev = data_size_mib / num_devices
-                    chunks_per_dev = int(data_per_dev / target_chunk_size_mib)
-                    chunks_per_dev = max(1, chunks_per_dev)
-
-                    chunks = np.array_split(host_data, chunks_per_dev * num_devices, axis=0)
-
-                    t0 = time.perf_counter()
-                    if chunks_per_dev > 1:    
-                        # We need to map chunks to the correct device
-                        # This simple example assumes chunks are perfectly divisible and ordered
-                        # In production, use `jax.sharding` mesh logic for complex layouts
-
-                        # approach 1: simple for loop
-                        for idx, chunk in enumerate(chunks):
-                            if num_devices > 1:
-                                dev = target_devices[idx % num_devices]
-                            else:
-                                dev = target_devices[0]
-                            tensors_on_device.append(jax.device_put(chunk, dev))
-                        # Re-assemble array
-                        result = jnp.vstack(tensors_on_device)
-                        # Wait for all chunks to be transferred
-                        result.block_until_ready()
-
-                        # approach 2: generator (slightly less overhead)
-                        # def chunk_generator(num_devices, chunks_per_dev):
-                        #     for n in range(chunks_per_dev):
-                        #         for d in range(num_devices):
-                        #             # 1. Get the specific small chunk
-                        #             chunk = chunks[d*chunks_per_dev+n]
-
-                        #             # 2. Trigger an individual DMA transfer for this specific chunk
-                        #             # This is where NUMA-local memory access matters
-                        #             yield jax.device_put(chunk, target_devices[d])
-
-                        # # Re-assemble array
-                        # result = jnp.vstack(list(chunk_generator(num_devices, chunks_per_dev)))
-                        # # Wait for all chunks to be transferred
-                        # result.block_until_ready()
-                    else:
-                        print(f"Warning: {data_size_mib=} is not larger than {target_chunk_size_mib=}, falling back to standard JAX put.")
-                        # Fallback to standard JAX put for small data
-                        result = jax.device_put(host_data, target_devices[0])
-                        result.block_until_ready()
-
-                    t1 = time.perf_counter()
-                    h2d_perf.append((t1 - t0) * 1000)
-
-                    # D2H
-                    t2 = time.perf_counter()
-                    # Simple device_get
-                    # Note: device_get returns a numpy array (copy)
-                    _ = jax.device_get(result)
-
-                    t3 = time.perf_counter()
-                    if not np.allclose(result, host_data):
-                        print("pipelined result not equal to host_data")
-                    d2h_perf.append((t3 - t2) * 1000)
-
-                    for r in tensors_on_device:
-                        r.delete()
-                    del tensors_on_device
+                # Simple device_put
+                device_array = jax.device_put(host_data)
+                device_array.block_until_ready()
+                
+                t1 = time.perf_counter()
+                h2d_perf.append((t1 - t0) * 1000)
+                
+                # Verify H2D shape
+                assert device_array.shape == host_data.shape
+                
+                # D2H
+                t2 = time.perf_counter()
+                
+                # Simple device_get
+                # Note: device_get returns a numpy array (copy)
+                _ = jax.device_get(device_array)
+                
+                t3 = time.perf_counter()
+                d2h_perf.append((t3 - t2) * 1000)
+                
+                device_array.delete()
 
     return {
         "H2D_Bandwidth_ms": h2d_perf,
@@ -173,7 +98,6 @@ def benchmark_host_device_calculate_metrics(
     data_size_mib: int,
     H2D_Bandwidth_ms: List[float],
     D2H_Bandwidth_ms: List[float],
-    h2d_type: str = "simple",
 ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
     """Calculates metrics for Host-Device transfer."""
     params = locals().items()

From f0d75359157501454737db25d4ee194336142b7d Mon Sep 17 00:00:00 2001
From: Leonard Chan <leoch@google.com>
Date: Thu, 5 Feb 2026 06:31:07 +0000
Subject: [PATCH 5/7] Revert "Add --numactl_binding flag to host_device YAMLs"

This reverts commit bbb316347a5277bcf2af5a829b0c4b25653f236d.
---
 Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml      | 2 +-
 Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml
index 1084fdf9..a6b8febd 100644
--- a/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml
+++ b/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml
@@ -53,7 +53,7 @@ spec:
           pip install -r requirements.txt
 
           GCS_BUCKET_DIR=${GCS_PATH}
-          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} --numactl_binding
+          python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR}
         resources:
           requests:
             google.com/tpu: 4
diff --git a/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml b/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml
index 83d06065..8c027c01 100644
--- a/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml
+++ b/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml
@@ -24,7 +24,7 @@ spec:
       cd accelerator-microbenchmarks
       pip install -r requirements.txt
 
-      bash ./Ironwood/scripts/run_host_device_benchmark.sh --config Ironwood/configs/host_device/host_device.yaml --numactl_binding
+      bash ./Ironwood/scripts/run_host_device_benchmark.sh --config Ironwood/configs/host_device/host_device.yaml
 
     resources:
       requests:

From 1592c089dfdeb74c667e27941de48174ab9df035 Mon Sep 17 00:00:00 2001
From: Leonard Chan <leoch@google.com>
Date: Thu, 5 Feb 2026 06:32:58 +0000
Subject: [PATCH 6/7] Revert "Add h2d_type column to H2D/D2H output"

This reverts commit bf5c79cf574359c52227938bcef47bc4d2392386.
---
 Ironwood/src/benchmark_host_device.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py
index 0c7eacc5..16352e2a 100644
--- a/Ironwood/src/benchmark_host_device.py
+++ b/Ironwood/src/benchmark_host_device.py
@@ -108,7 +108,6 @@ def benchmark_host_device_calculate_metrics(
     }
     metadata = {k: v for k, v in params if k in metadata_keys}
     metadata["dtype"] = "float32"
-    metadata["h2d_type"] = h2d_type
     
     metrics = {}
     

From 0d6597fe8cb4c538cb974f8435fa95ae68a72903 Mon Sep 17 00:00:00 2001
From: Leonard Chan <leoch@google.com>
Date: Thu, 5 Feb 2026 08:47:57 +0000
Subject: [PATCH 7/7] Revert "Revert the changes that were made for an urgent
 demo (#90)"

This reverts commit 3e4b59a0d8a5e87bea51f8c934895a4c9fda3ac5.
---
 Ironwood/configs/collectives/all_gather_1d.yaml        |  3 +--
 Ironwood/configs/collectives/all_gather_2d.yaml        |  3 +--
 Ironwood/configs/collectives/all_gather_3d.yaml        |  3 +--
 Ironwood/configs/collectives/all_gather_demo.yaml      | 10 +++-------
 .../configs/collectives/all_gather_tpu7x_2x2x1.yaml    |  3 +--
 .../configs/collectives/all_gather_tpu7x_2x2x2.yaml    |  3 +--
 .../configs/collectives/all_gather_tpu7x_2x2x4.yaml    |  3 +--
 .../configs/collectives/all_gather_tpu7x_2x4x4.yaml    |  3 +--
 .../configs/collectives/all_gather_tpu7x_4x4x4.yaml    |  5 ++---
 .../configs/collectives/all_gather_tpu7x_4x4x8.yaml    |  3 +--
 Ironwood/configs/collectives/all_reduce_1d.yaml        |  3 +--
 Ironwood/configs/collectives/all_reduce_2d.yaml        |  3 +--
 Ironwood/configs/collectives/all_reduce_3d.yaml        |  3 +--
 .../configs/collectives/all_reduce_tpu7x_2x2x1.yaml    |  3 +--
 .../configs/collectives/all_reduce_tpu7x_2x2x2.yaml    |  3 +--
 .../configs/collectives/all_reduce_tpu7x_2x2x4.yaml    |  3 +--
 .../configs/collectives/all_reduce_tpu7x_2x4x4.yaml    |  3 +--
 .../configs/collectives/all_reduce_tpu7x_4x4x4.yaml    |  3 +--
 .../configs/collectives/all_reduce_tpu7x_4x4x8.yaml    |  3 +--
 Ironwood/configs/collectives/all_to_all_1d.yaml        |  2 +-
 Ironwood/configs/collectives/all_to_all_2d.yaml        |  2 +-
 Ironwood/configs/collectives/all_to_all_3d.yaml        |  2 +-
 .../configs/collectives/all_to_all_tpu7x_2x2x1.yaml    |  3 +--
 .../configs/collectives/all_to_all_tpu7x_2x2x2.yaml    |  3 +--
 .../configs/collectives/all_to_all_tpu7x_2x2x4.yaml    |  3 +--
 .../configs/collectives/all_to_all_tpu7x_2x4x4.yaml    |  3 +--
 .../configs/collectives/all_to_all_tpu7x_4x4x4.yaml    |  3 +--
 .../configs/collectives/all_to_all_tpu7x_4x4x8.yaml    |  3 +--
 Ironwood/configs/collectives/reduce_scatter_1d.yaml    |  3 +--
 Ironwood/configs/collectives/reduce_scatter_2d.yaml    |  3 +--
 30 files changed, 33 insertions(+), 63 deletions(-)

diff --git a/Ironwood/configs/collectives/all_gather_1d.yaml b/Ironwood/configs/collectives/all_gather_1d.yaml
index 85d8fc3e..0b1313dc 100644
--- a/Ironwood/configs/collectives/all_gather_1d.yaml
+++ b/Ironwood/configs/collectives/all_gather_1d.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_gather_1d"
   csv_path: "../microbenchmarks/all_gather_1d"
   xlml_metrics_dir: "../microbenchmarks/all_gather_1d"
diff --git a/Ironwood/configs/collectives/all_gather_2d.yaml b/Ironwood/configs/collectives/all_gather_2d.yaml
index 2d7a0e7a..c45f3e70 100644
--- a/Ironwood/configs/collectives/all_gather_2d.yaml
+++ b/Ironwood/configs/collectives/all_gather_2d.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} # Parallel Replica Groups
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica Groups
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5}
   trace_dir: "../microbenchmarks/all_gather_2d"
   csv_path: "../microbenchmarks/all_gather_2d"
   xlml_metrics_dir: "../microbenchmarks/all_gather_2d"
diff --git a/Ironwood/configs/collectives/all_gather_3d.yaml b/Ironwood/configs/collectives/all_gather_3d.yaml
index cc876a08..e159adfd 100644
--- a/Ironwood/configs/collectives/all_gather_3d.yaml
+++ b/Ironwood/configs/collectives/all_gather_3d.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5} # Parallel Replica Groups
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} # Non Parallel Replica Groups
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5}
   trace_dir: "../microbenchmarks/all_gather_3d"
   csv_path: "../microbenchmarks/all_gather_3d"
   xlml_metrics_dir: "../microbenchmarks/all_gather_3d"
diff --git a/Ironwood/configs/collectives/all_gather_demo.yaml b/Ironwood/configs/collectives/all_gather_demo.yaml
index 6fb5a757..a9d776cd 100644
--- a/Ironwood/configs/collectives/all_gather_demo.yaml
+++ b/Ironwood/configs/collectives/all_gather_demo.yaml
@@ -1,13 +1,9 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1} # Parallel Replica
-  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1} # Non-Parallel Replica
-  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2} # Non Parallel Replica Groups
-  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2} # Parallel Replica Groups
-  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3} # Parallel Replica Groups
-  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3} # Non Parallel Replica Groups
-
+  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1}
+  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2}
+  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3}
 
   warmup_tries: 10
   trace_dir: "../microbenchmarks/all_gather_demo"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml
index 9bc586a1..0338aef1 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1"
   csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x1"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml
index b5be0c8d..9253bac5 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2"
   csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x2"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml
index 09b02979..9f8af67f 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4"
   csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x4"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml
index 4f6cf11a..724fff00 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_2x4x4"
   csv_path: "../microbenchmarks/all_gather_tpu7x_2x4x4"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x4x4"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml
index 77f3ed13..65189cc9 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml
@@ -1,9 +1,8 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 10}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4"
   csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x4"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4"
-  xla_dump_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4/hlo_graphs"
\ No newline at end of file
+  xla_dump_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4/hlo_graphs"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml
index 12743d61..77c4da6f 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8"
   csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x8"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8"
diff --git a/Ironwood/configs/collectives/all_reduce_1d.yaml b/Ironwood/configs/collectives/all_reduce_1d.yaml
index 7b1d3068..d12d4221 100644
--- a/Ironwood/configs/collectives/all_reduce_1d.yaml
+++ b/Ironwood/configs/collectives/all_reduce_1d.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1" , op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non Parallel Replica
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1" , op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_reduce_1d"
   csv_path: "../microbenchmarks/all_reduce_1d"
   xlml_metrics_dir: "../microbenchmarks/all_reduce_1d"
diff --git a/Ironwood/configs/collectives/all_reduce_2d.yaml b/Ironwood/configs/collectives/all_reduce_2d.yaml
index 93e1a7c9..5aa9654e 100644
--- a/Ironwood/configs/collectives/all_reduce_2d.yaml
+++ b/Ironwood/configs/collectives/all_reduce_2d.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} # Parallel Replica Groups
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5}  # Non Parallel Replica Groups
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5}
   trace_dir: "../microbenchmarks/all_reduce_2d"
   csv_path: "../microbenchmarks/all_reduce_2d"
   xlml_metrics_dir: "../microbenchmarks/all_reduce_2d"
diff --git a/Ironwood/configs/collectives/all_reduce_3d.yaml b/Ironwood/configs/collectives/all_reduce_3d.yaml
index f6a4ad9d..4e76b55f 100644
--- a/Ironwood/configs/collectives/all_reduce_3d.yaml
+++ b/Ironwood/configs/collectives/all_reduce_3d.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5} # Parallel Replica Groups
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} # Non Parallel Replica Groups
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5}
   trace_dir: "../microbenchmarks/all_reduce_3d"
   csv_path: "../microbenchmarks/all_reduce_3d"
   xlml_metrics_dir: "../microbenchmarks/all_reduce_3d"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml
index f7389925..6d2d506c 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/psum_tpu7x_2x2x1"
   csv_path: "../microbenchmarks/psum_tpu7x_2x2x1"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x1"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml
index b2cb202c..d11981b0 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/psum_tpu7x_2x2x2"
   csv_path: "../microbenchmarks/psum_tpu7x_2x2x2"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x2"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml
index 946fd5ed..ab243b6f 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/psum_tpu7x_2x2x4"
   csv_path: "../microbenchmarks/psum_tpu7x_2x2x4"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x4"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml
index 613717cf..c731c622 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/psum_tpu7x_2x4x4"
   csv_path: "../microbenchmarks/psum_tpu7x_2x4x4"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x4x4"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml
index 3f4822c0..53d8dd3d 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/psum_tpu7x_4x4x4"
   csv_path: "../microbenchmarks/psum_tpu7x_4x4x4"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x4"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml
index a14bbfe8..f87878a4 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/psum_tpu7x_4x4x8"
   csv_path: "../microbenchmarks/psum_tpu7x_4x4x8"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x8"
diff --git a/Ironwood/configs/collectives/all_to_all_1d.yaml b/Ironwood/configs/collectives/all_to_all_1d.yaml
index 3c28194d..8d222613 100644
--- a/Ironwood/configs/collectives/all_to_all_1d.yaml
+++ b/Ironwood/configs/collectives/all_to_all_1d.yaml
@@ -1,7 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non Parallel Replica
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_to_all_1d"
   csv_path: "../microbenchmarks/all_to_all_1d"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_1d"
diff --git a/Ironwood/configs/collectives/all_to_all_2d.yaml b/Ironwood/configs/collectives/all_to_all_2d.yaml
index b4a1bc0e..d23115fe 100644
--- a/Ironwood/configs/collectives/all_to_all_2d.yaml
+++ b/Ironwood/configs/collectives/all_to_all_2d.yaml
@@ -1,7 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5}
   trace_dir: "../microbenchmarks/all_to_all_2d"
   csv_path: "../microbenchmarks/all_to_all_2d"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_2d"
diff --git a/Ironwood/configs/collectives/all_to_all_3d.yaml b/Ironwood/configs/collectives/all_to_all_3d.yaml
index 3aa0e2a7..c705754c 100644
--- a/Ironwood/configs/collectives/all_to_all_3d.yaml
+++ b/Ironwood/configs/collectives/all_to_all_3d.yaml
@@ -1,7 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} # Non Parallel Replica Groups
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5}
   trace_dir: "../microbenchmarks/all_to_all_3d"
   csv_path: "../microbenchmarks/all_to_all_3d"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_3d"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml
index 96da2c38..f9786b29 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x1"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml
index 388a4468..b530a698 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x2"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml
index e0cc48c9..86e3dbbc 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x4"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml
index 5ae19b6e..6d4b79fb 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_2x4x4"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml
index 4cc8f6bb..3460ddb6 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x4"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml
index 212cd92d..93ef7cb7 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x8"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8"
diff --git a/Ironwood/configs/collectives/reduce_scatter_1d.yaml b/Ironwood/configs/collectives/reduce_scatter_1d.yaml
index 9c2c0dea..063d73fc 100644
--- a/Ironwood/configs/collectives/reduce_scatter_1d.yaml
+++ b/Ironwood/configs/collectives/reduce_scatter_1d.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum_scatter
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/reduce_scatter_1d"
   csv_path: "../microbenchmarks/reduce_scatter_1d"
   xlml_metrics_dir: "../microbenchmarks/reduce_scatter_1d"
diff --git a/Ironwood/configs/collectives/reduce_scatter_2d.yaml b/Ironwood/configs/collectives/reduce_scatter_2d.yaml
index f329b571..027ac991 100644
--- a/Ironwood/configs/collectives/reduce_scatter_2d.yaml
+++ b/Ironwood/configs/collectives/reduce_scatter_2d.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum_scatter
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} # Parallel Replica Groups
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica Groups
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5}
   trace_dir: "../microbenchmarks/reduce_scatter_2d"
   csv_path: "../microbenchmarks/reduce_scatter_2d"
   xlml_metrics_dir: "../microbenchmarks/reduce_scatter_2d"