From b25f34e52522cf473f9f192ff56b80849261620a Mon Sep 17 00:00:00 2001 From: Leonard Chan Date: Thu, 5 Feb 2026 00:15:58 +0000 Subject: [PATCH 1/7] Add baseline pipelined flow to H2D benchmark --- Ironwood/configs/host_device/host_device.yaml | 2 +- Ironwood/src/benchmark_host_device.py | 92 ++++++++++++++----- 2 files changed, 68 insertions(+), 26 deletions(-) diff --git a/Ironwood/configs/host_device/host_device.yaml b/Ironwood/configs/host_device/host_device.yaml index 8d572ed7..ff97df1b 100644 --- a/Ironwood/configs/host_device/host_device.yaml +++ b/Ironwood/configs/host_device/host_device.yaml @@ -3,8 +3,8 @@ benchmarks: num_runs: 20 benchmark_sweep_params: - { + h2d_type: ["simple", "pipelined"], data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768], - h2d_type: ["simple", "pipelined"] } csv_path: "../microbenchmarks/host_device" trace_dir: "../microbenchmarks/host_device/trace" diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py index 8a36a2c7..1d72b5eb 100644 --- a/Ironwood/src/benchmark_host_device.py +++ b/Ironwood/src/benchmark_host_device.py @@ -5,6 +5,7 @@ from typing import Any, Dict, Tuple, List import jax +from jax import numpy as jnp import numpy as np from benchmark_utils import MetricsStatistics @@ -33,14 +34,9 @@ def benchmark_host_device( host_data = np.random.normal(size=(num_elements // column, column)).astype(np.float32) # Used in pipelined flow + # TODO: turn into a param num_devices_to_perform_h2d = 1 - tensor_size = 4 * 1024 * 1024 - target_device = jax.devices()[:num_devices_to_perform_h2d] - mesh = jax.sharding.Mesh(np.array(target_device), axis_names=["x"]) - sharding = jax.sharding.NamedSharding(mesh, jax.sharding.PartitionSpec("x")) - pipelined_array = None - if h2d_type == "pipelined": - pipelined_array = np.random.normal(size=(tensor_size,)).astype(np.float32) + target_devices = jax.devices()[:num_devices_to_perform_h2d] print( f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations with {h2d_type=}", @@ -99,28 +95,74 @@ def benchmark_host_device( device_array.delete() elif h2d_type == "pipelined": + target_chunk_size_mib = 16 # Sweet spot from profiling + num_devices = len(target_devices) + tensors_on_device = [] - if data_size_mib * 1024 * 1024 < pipelined_array.nbytes: - print(f"Warning: {data_size_mib=} is smaller than pipeline unit, no data will be transferred.") - t0 = time.perf_counter() - # Assume data_size_mib is total across devices for now - bytes_left = 1024 * 1024 * data_size_mib - while bytes_left >= pipelined_array.nbytes: - with jax.profiler.StepTraceAnnotation("device_put", step_num=1): - x_device = jax.device_put(pipelined_array, sharding) - tensors_on_device.append(x_device) - bytes_left -= pipelined_array.nbytes - total_bytes_transferred = 0 - for tensor in tensors_on_device: - tensor.block_until_ready() - total_bytes_transferred += tensor.nbytes - tensor.delete() - t1 = time.perf_counter() + # Calculate chunks per device + data_per_dev = data_size_mib / num_devices + chunks_per_dev = int(data_per_dev / target_chunk_size_mib) + chunks_per_dev = max(1, chunks_per_dev) + + chunks = np.array_split(host_data, chunks_per_dev * num_devices, axis=0) + t0 = time.perf_counter() + if chunks_per_dev > 1: + # We need to map chunks to the correct device + # This simple example assumes chunks are perfectly divisible and ordered + # In production, use `jax.sharding` mesh logic for complex layouts + + # approach 1: simple for loop + for idx, chunk in enumerate(chunks): + if num_devices > 1: + dev = target_devices[idx % num_devices] + else: + dev = target_devices[0] + tensors_on_device.append(jax.device_put(chunk, dev)) + # Re-assemble array + result = jnp.vstack(tensors_on_device) + # Wait for all chunks to be transferred + result.block_until_ready() + + # approach 2: generator (slightly less overhead) + # def chunk_generator(num_devices, chunks_per_dev): + # for n in range(chunks_per_dev): + # for d in range(num_devices): + # # 1. Get the specific small chunk + # chunk = chunks[d*chunks_per_dev+n] + + # # 2. Trigger an individual DMA transfer for this specific chunk + # # This is where NUMA-local memory access matters + # yield jax.device_put(chunk, target_devices[d]) + + # # Re-assemble array + # result = jnp.vstack(list(chunk_generator(num_devices, chunks_per_dev))) + # # Wait for all chunks to be transferred + # result.block_until_ready() + else: + print(f"Warning: {data_size_mib=} is not larger than {target_chunk_size_mib=}, falling back to standard JAX put.") + # Fallback to standard JAX put for small data + result = jax.device_put(host_data, target_devices[0]) + result.block_until_ready() + + t1 = time.perf_counter() h2d_perf.append((t1 - t0) * 1000) - # Implement D2H at a later time after we establish H2D - d2h_perf.append(0) + + # D2H + t2 = time.perf_counter() + # Simple device_get + # Note: device_get returns a numpy array (copy) + _ = jax.device_get(result) + + t3 = time.perf_counter() + if not np.allclose(result, host_data): + print("pipelined result not equal to host_data") + d2h_perf.append((t3 - t2) * 1000) + + for r in tensors_on_device: + r.delete() + del tensors_on_device return { "H2D_Bandwidth_ms": h2d_perf, From bbb316347a5277bcf2af5a829b0c4b25653f236d Mon Sep 17 00:00:00 2001 From: Leonard Chan Date: Thu, 5 Feb 2026 02:52:30 +0000 Subject: [PATCH 2/7] Add --numactl_binding flag to host_device YAMLs --- Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml | 2 +- Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml index a6b8febd..1084fdf9 100644 --- a/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml +++ b/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml @@ -53,7 +53,7 @@ spec: pip install -r requirements.txt GCS_BUCKET_DIR=${GCS_PATH} - python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} --numactl_binding resources: requests: google.com/tpu: 4 diff --git a/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml b/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml index 8c027c01..83d06065 100644 --- a/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml +++ b/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml @@ -24,7 +24,7 @@ spec: cd accelerator-microbenchmarks pip install -r requirements.txt - bash ./Ironwood/scripts/run_host_device_benchmark.sh --config Ironwood/configs/host_device/host_device.yaml + bash ./Ironwood/scripts/run_host_device_benchmark.sh --config Ironwood/configs/host_device/host_device.yaml --numactl_binding resources: requests: From bf5c79cf574359c52227938bcef47bc4d2392386 Mon Sep 17 00:00:00 2001 From: Leonard Chan Date: Thu, 5 Feb 2026 06:07:41 +0000 Subject: [PATCH 3/7] Add h2d_type column to H2D/D2H output --- Ironwood/src/benchmark_host_device.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py index 1d72b5eb..d80c2819 100644 --- a/Ironwood/src/benchmark_host_device.py +++ b/Ironwood/src/benchmark_host_device.py @@ -184,6 +184,7 @@ def benchmark_host_device_calculate_metrics( } metadata = {k: v for k, v in params if k in metadata_keys} metadata["dtype"] = "float32" + metadata["h2d_type"] = h2d_type metrics = {} From 1fd18109656e1110e7128e095bf9a42ac64a6e7a Mon Sep 17 00:00:00 2001 From: Leonard Chan Date: Thu, 5 Feb 2026 06:21:56 +0000 Subject: [PATCH 4/7] Revert "Add baseline pipelined flow to H2D benchmark" This reverts commit a86475d6a50bb644617d6ee9e63427072526fc61. --- Ironwood/configs/host_device/host_device.yaml | 3 +- Ironwood/src/benchmark_host_device.py | 126 ++++-------------- 2 files changed, 26 insertions(+), 103 deletions(-) diff --git a/Ironwood/configs/host_device/host_device.yaml b/Ironwood/configs/host_device/host_device.yaml index ff97df1b..0b48800c 100644 --- a/Ironwood/configs/host_device/host_device.yaml +++ b/Ironwood/configs/host_device/host_device.yaml @@ -3,8 +3,7 @@ benchmarks: num_runs: 20 benchmark_sweep_params: - { - h2d_type: ["simple", "pipelined"], - data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768], + data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768] } csv_path: "../microbenchmarks/host_device" trace_dir: "../microbenchmarks/host_device/trace" diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py index d80c2819..0c7eacc5 100644 --- a/Ironwood/src/benchmark_host_device.py +++ b/Ironwood/src/benchmark_host_device.py @@ -5,7 +5,7 @@ from typing import Any, Dict, Tuple, List import jax -from jax import numpy as jnp +from jax import sharding import numpy as np from benchmark_utils import MetricsStatistics @@ -23,9 +23,8 @@ def benchmark_host_device( data_size_mib: int, num_runs: int = 100, trace_dir: str = None, - h2d_type: str = "simple", ) -> Dict[str, Any]: - """Benchmarks H2D/D2H transfer using device_put/device_get.""" + """Benchmarks H2D/D2H transfer using simple device_put/device_get.""" num_elements = 1024 * 1024 * data_size_mib // np.dtype(np.float32).itemsize @@ -33,13 +32,8 @@ def benchmark_host_device( column = 128 host_data = np.random.normal(size=(num_elements // column, column)).astype(np.float32) - # Used in pipelined flow - # TODO: turn into a param - num_devices_to_perform_h2d = 1 - target_devices = jax.devices()[:num_devices_to_perform_h2d] - print( - f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations with {h2d_type=}", + f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations", flush=True ) @@ -71,98 +65,29 @@ def benchmark_host_device( with step_context: # H2D - if h2d_type == "simple": - t0 = time.perf_counter() - # Simple device_put - device_array = jax.device_put(host_data) - device_array.block_until_ready() - t1 = time.perf_counter() - - # Verify H2D shape - assert device_array.shape == host_data.shape - - h2d_perf.append((t1 - t0) * 1000) + t0 = time.perf_counter() - # D2H - t2 = time.perf_counter() - - # Simple device_get - # Note: device_get returns a numpy array (copy) - _ = jax.device_get(device_array) - - t3 = time.perf_counter() - d2h_perf.append((t3 - t2) * 1000) - - device_array.delete() - elif h2d_type == "pipelined": - target_chunk_size_mib = 16 # Sweet spot from profiling - num_devices = len(target_devices) - - tensors_on_device = [] - - # Calculate chunks per device - data_per_dev = data_size_mib / num_devices - chunks_per_dev = int(data_per_dev / target_chunk_size_mib) - chunks_per_dev = max(1, chunks_per_dev) - - chunks = np.array_split(host_data, chunks_per_dev * num_devices, axis=0) - - t0 = time.perf_counter() - if chunks_per_dev > 1: - # We need to map chunks to the correct device - # This simple example assumes chunks are perfectly divisible and ordered - # In production, use `jax.sharding` mesh logic for complex layouts - - # approach 1: simple for loop - for idx, chunk in enumerate(chunks): - if num_devices > 1: - dev = target_devices[idx % num_devices] - else: - dev = target_devices[0] - tensors_on_device.append(jax.device_put(chunk, dev)) - # Re-assemble array - result = jnp.vstack(tensors_on_device) - # Wait for all chunks to be transferred - result.block_until_ready() - - # approach 2: generator (slightly less overhead) - # def chunk_generator(num_devices, chunks_per_dev): - # for n in range(chunks_per_dev): - # for d in range(num_devices): - # # 1. Get the specific small chunk - # chunk = chunks[d*chunks_per_dev+n] - - # # 2. Trigger an individual DMA transfer for this specific chunk - # # This is where NUMA-local memory access matters - # yield jax.device_put(chunk, target_devices[d]) - - # # Re-assemble array - # result = jnp.vstack(list(chunk_generator(num_devices, chunks_per_dev))) - # # Wait for all chunks to be transferred - # result.block_until_ready() - else: - print(f"Warning: {data_size_mib=} is not larger than {target_chunk_size_mib=}, falling back to standard JAX put.") - # Fallback to standard JAX put for small data - result = jax.device_put(host_data, target_devices[0]) - result.block_until_ready() - - t1 = time.perf_counter() - h2d_perf.append((t1 - t0) * 1000) - - # D2H - t2 = time.perf_counter() - # Simple device_get - # Note: device_get returns a numpy array (copy) - _ = jax.device_get(result) - - t3 = time.perf_counter() - if not np.allclose(result, host_data): - print("pipelined result not equal to host_data") - d2h_perf.append((t3 - t2) * 1000) - - for r in tensors_on_device: - r.delete() - del tensors_on_device + # Simple device_put + device_array = jax.device_put(host_data) + device_array.block_until_ready() + + t1 = time.perf_counter() + h2d_perf.append((t1 - t0) * 1000) + + # Verify H2D shape + assert device_array.shape == host_data.shape + + # D2H + t2 = time.perf_counter() + + # Simple device_get + # Note: device_get returns a numpy array (copy) + _ = jax.device_get(device_array) + + t3 = time.perf_counter() + d2h_perf.append((t3 - t2) * 1000) + + device_array.delete() return { "H2D_Bandwidth_ms": h2d_perf, @@ -173,7 +98,6 @@ def benchmark_host_device_calculate_metrics( data_size_mib: int, H2D_Bandwidth_ms: List[float], D2H_Bandwidth_ms: List[float], - h2d_type: str = "simple", ) -> Tuple[Dict[str, Any], Dict[str, Any]]: """Calculates metrics for Host-Device transfer.""" params = locals().items() From f0d75359157501454737db25d4ee194336142b7d Mon Sep 17 00:00:00 2001 From: Leonard Chan Date: Thu, 5 Feb 2026 06:31:07 +0000 Subject: [PATCH 5/7] Revert "Add --numactl_binding flag to host_device YAMLs" This reverts commit bbb316347a5277bcf2af5a829b0c4b25653f236d. --- Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml | 2 +- Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml b/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml index 1084fdf9..a6b8febd 100644 --- a/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml +++ b/Ironwood/guides/automation/tpu7x-2x2x1-host_device.yaml @@ -53,7 +53,7 @@ spec: pip install -r requirements.txt GCS_BUCKET_DIR=${GCS_PATH} - python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} --numactl_binding + python Ironwood/src/run_benchmark.py --config="Ironwood/configs/host_device/host_device.yaml" --gcs-bucket-csv-dir=${GCS_BUCKET_DIR} resources: requests: google.com/tpu: 4 diff --git a/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml b/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml index 83d06065..8c027c01 100644 --- a/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml +++ b/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml @@ -24,7 +24,7 @@ spec: cd accelerator-microbenchmarks pip install -r requirements.txt - bash ./Ironwood/scripts/run_host_device_benchmark.sh --config Ironwood/configs/host_device/host_device.yaml --numactl_binding + bash ./Ironwood/scripts/run_host_device_benchmark.sh --config Ironwood/configs/host_device/host_device.yaml resources: requests: From 1592c089dfdeb74c667e27941de48174ab9df035 Mon Sep 17 00:00:00 2001 From: Leonard Chan Date: Thu, 5 Feb 2026 06:32:58 +0000 Subject: [PATCH 6/7] Revert "Add h2d_type column to H2D/D2H output" This reverts commit bf5c79cf574359c52227938bcef47bc4d2392386. --- Ironwood/src/benchmark_host_device.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py index 0c7eacc5..16352e2a 100644 --- a/Ironwood/src/benchmark_host_device.py +++ b/Ironwood/src/benchmark_host_device.py @@ -108,7 +108,6 @@ def benchmark_host_device_calculate_metrics( } metadata = {k: v for k, v in params if k in metadata_keys} metadata["dtype"] = "float32" - metadata["h2d_type"] = h2d_type metrics = {} From 0d6597fe8cb4c538cb974f8435fa95ae68a72903 Mon Sep 17 00:00:00 2001 From: Leonard Chan Date: Thu, 5 Feb 2026 08:47:57 +0000 Subject: [PATCH 7/7] Revert "Revert the changes that were made for an urgent demo (#90)" This reverts commit 3e4b59a0d8a5e87bea51f8c934895a4c9fda3ac5. --- Ironwood/configs/collectives/all_gather_1d.yaml | 3 +-- Ironwood/configs/collectives/all_gather_2d.yaml | 3 +-- Ironwood/configs/collectives/all_gather_3d.yaml | 3 +-- Ironwood/configs/collectives/all_gather_demo.yaml | 10 +++------- .../configs/collectives/all_gather_tpu7x_2x2x1.yaml | 3 +-- .../configs/collectives/all_gather_tpu7x_2x2x2.yaml | 3 +-- .../configs/collectives/all_gather_tpu7x_2x2x4.yaml | 3 +-- .../configs/collectives/all_gather_tpu7x_2x4x4.yaml | 3 +-- .../configs/collectives/all_gather_tpu7x_4x4x4.yaml | 5 ++--- .../configs/collectives/all_gather_tpu7x_4x4x8.yaml | 3 +-- Ironwood/configs/collectives/all_reduce_1d.yaml | 3 +-- Ironwood/configs/collectives/all_reduce_2d.yaml | 3 +-- Ironwood/configs/collectives/all_reduce_3d.yaml | 3 +-- .../configs/collectives/all_reduce_tpu7x_2x2x1.yaml | 3 +-- .../configs/collectives/all_reduce_tpu7x_2x2x2.yaml | 3 +-- .../configs/collectives/all_reduce_tpu7x_2x2x4.yaml | 3 +-- .../configs/collectives/all_reduce_tpu7x_2x4x4.yaml | 3 +-- .../configs/collectives/all_reduce_tpu7x_4x4x4.yaml | 3 +-- .../configs/collectives/all_reduce_tpu7x_4x4x8.yaml | 3 +-- Ironwood/configs/collectives/all_to_all_1d.yaml | 2 +- Ironwood/configs/collectives/all_to_all_2d.yaml | 2 +- Ironwood/configs/collectives/all_to_all_3d.yaml | 2 +- .../configs/collectives/all_to_all_tpu7x_2x2x1.yaml | 3 +-- .../configs/collectives/all_to_all_tpu7x_2x2x2.yaml | 3 +-- .../configs/collectives/all_to_all_tpu7x_2x2x4.yaml | 3 +-- .../configs/collectives/all_to_all_tpu7x_2x4x4.yaml | 3 +-- .../configs/collectives/all_to_all_tpu7x_4x4x4.yaml | 3 +-- .../configs/collectives/all_to_all_tpu7x_4x4x8.yaml | 3 +-- Ironwood/configs/collectives/reduce_scatter_1d.yaml | 3 +-- Ironwood/configs/collectives/reduce_scatter_2d.yaml | 3 +-- 30 files changed, 33 insertions(+), 63 deletions(-) diff --git a/Ironwood/configs/collectives/all_gather_1d.yaml b/Ironwood/configs/collectives/all_gather_1d.yaml index 85d8fc3e..0b1313dc 100644 --- a/Ironwood/configs/collectives/all_gather_1d.yaml +++ b/Ironwood/configs/collectives/all_gather_1d.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_gather_1d" csv_path: "../microbenchmarks/all_gather_1d" xlml_metrics_dir: "../microbenchmarks/all_gather_1d" diff --git a/Ironwood/configs/collectives/all_gather_2d.yaml b/Ironwood/configs/collectives/all_gather_2d.yaml index 2d7a0e7a..c45f3e70 100644 --- a/Ironwood/configs/collectives/all_gather_2d.yaml +++ b/Ironwood/configs/collectives/all_gather_2d.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} # Parallel Replica Groups - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} trace_dir: "../microbenchmarks/all_gather_2d" csv_path: "../microbenchmarks/all_gather_2d" xlml_metrics_dir: "../microbenchmarks/all_gather_2d" diff --git a/Ironwood/configs/collectives/all_gather_3d.yaml b/Ironwood/configs/collectives/all_gather_3d.yaml index cc876a08..e159adfd 100644 --- a/Ironwood/configs/collectives/all_gather_3d.yaml +++ b/Ironwood/configs/collectives/all_gather_3d.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5} # Parallel Replica Groups - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} # Non Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5} trace_dir: "../microbenchmarks/all_gather_3d" csv_path: "../microbenchmarks/all_gather_3d" xlml_metrics_dir: "../microbenchmarks/all_gather_3d" diff --git a/Ironwood/configs/collectives/all_gather_demo.yaml b/Ironwood/configs/collectives/all_gather_demo.yaml index 6fb5a757..a9d776cd 100644 --- a/Ironwood/configs/collectives/all_gather_demo.yaml +++ b/Ironwood/configs/collectives/all_gather_demo.yaml @@ -1,13 +1,9 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1} # Parallel Replica - - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1} # Non-Parallel Replica - - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2} # Non Parallel Replica Groups - - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2} # Parallel Replica Groups - - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3} # Parallel Replica Groups - - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3} # Non Parallel Replica Groups - + - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1} + - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2} + - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3} warmup_tries: 10 trace_dir: "../microbenchmarks/all_gather_demo" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml index 9bc586a1..0338aef1 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1" csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x1" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml index b5be0c8d..9253bac5 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2" csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x2" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml index 09b02979..9f8af67f 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4" csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x4" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml index 4f6cf11a..724fff00 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_gather_tpu7x_2x4x4" csv_path: "../microbenchmarks/all_gather_tpu7x_2x4x4" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x4x4" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml index 77f3ed13..65189cc9 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml @@ -1,9 +1,8 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 10} trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4" csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x4" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4" - xla_dump_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4/hlo_graphs" \ No newline at end of file + xla_dump_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4/hlo_graphs" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml index 12743d61..77c4da6f 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8" csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x8" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8" diff --git a/Ironwood/configs/collectives/all_reduce_1d.yaml b/Ironwood/configs/collectives/all_reduce_1d.yaml index 7b1d3068..d12d4221 100644 --- a/Ironwood/configs/collectives/all_reduce_1d.yaml +++ b/Ironwood/configs/collectives/all_reduce_1d.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1" , op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non Parallel Replica + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1" , op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_reduce_1d" csv_path: "../microbenchmarks/all_reduce_1d" xlml_metrics_dir: "../microbenchmarks/all_reduce_1d" diff --git a/Ironwood/configs/collectives/all_reduce_2d.yaml b/Ironwood/configs/collectives/all_reduce_2d.yaml index 93e1a7c9..5aa9654e 100644 --- a/Ironwood/configs/collectives/all_reduce_2d.yaml +++ b/Ironwood/configs/collectives/all_reduce_2d.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} # Parallel Replica Groups - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} trace_dir: "../microbenchmarks/all_reduce_2d" csv_path: "../microbenchmarks/all_reduce_2d" xlml_metrics_dir: "../microbenchmarks/all_reduce_2d" diff --git a/Ironwood/configs/collectives/all_reduce_3d.yaml b/Ironwood/configs/collectives/all_reduce_3d.yaml index f6a4ad9d..4e76b55f 100644 --- a/Ironwood/configs/collectives/all_reduce_3d.yaml +++ b/Ironwood/configs/collectives/all_reduce_3d.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5} # Parallel Replica Groups - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} # Non Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5} trace_dir: "../microbenchmarks/all_reduce_3d" csv_path: "../microbenchmarks/all_reduce_3d" xlml_metrics_dir: "../microbenchmarks/all_reduce_3d" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml index f7389925..6d2d506c 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/psum_tpu7x_2x2x1" csv_path: "../microbenchmarks/psum_tpu7x_2x2x1" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x1" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml index b2cb202c..d11981b0 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/psum_tpu7x_2x2x2" csv_path: "../microbenchmarks/psum_tpu7x_2x2x2" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x2" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml index 946fd5ed..ab243b6f 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/psum_tpu7x_2x2x4" csv_path: "../microbenchmarks/psum_tpu7x_2x2x4" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x4" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml index 613717cf..c731c622 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/psum_tpu7x_2x4x4" csv_path: "../microbenchmarks/psum_tpu7x_2x4x4" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x4x4" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml index 3f4822c0..53d8dd3d 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/psum_tpu7x_4x4x4" csv_path: "../microbenchmarks/psum_tpu7x_4x4x4" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x4" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml index a14bbfe8..f87878a4 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/psum_tpu7x_4x4x8" csv_path: "../microbenchmarks/psum_tpu7x_4x4x8" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x8" diff --git a/Ironwood/configs/collectives/all_to_all_1d.yaml b/Ironwood/configs/collectives/all_to_all_1d.yaml index 3c28194d..8d222613 100644 --- a/Ironwood/configs/collectives/all_to_all_1d.yaml +++ b/Ironwood/configs/collectives/all_to_all_1d.yaml @@ -1,7 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non Parallel Replica + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_to_all_1d" csv_path: "../microbenchmarks/all_to_all_1d" xlml_metrics_dir: "../microbenchmarks/all_to_all_1d" diff --git a/Ironwood/configs/collectives/all_to_all_2d.yaml b/Ironwood/configs/collectives/all_to_all_2d.yaml index b4a1bc0e..d23115fe 100644 --- a/Ironwood/configs/collectives/all_to_all_2d.yaml +++ b/Ironwood/configs/collectives/all_to_all_2d.yaml @@ -1,7 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} trace_dir: "../microbenchmarks/all_to_all_2d" csv_path: "../microbenchmarks/all_to_all_2d" xlml_metrics_dir: "../microbenchmarks/all_to_all_2d" diff --git a/Ironwood/configs/collectives/all_to_all_3d.yaml b/Ironwood/configs/collectives/all_to_all_3d.yaml index 3aa0e2a7..c705754c 100644 --- a/Ironwood/configs/collectives/all_to_all_3d.yaml +++ b/Ironwood/configs/collectives/all_to_all_3d.yaml @@ -1,7 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} # Non Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} trace_dir: "../microbenchmarks/all_to_all_3d" csv_path: "../microbenchmarks/all_to_all_3d" xlml_metrics_dir: "../microbenchmarks/all_to_all_3d" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml index 96da2c38..f9786b29 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x1" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml index 388a4468..b530a698 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x2" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml index e0cc48c9..86e3dbbc 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x4" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml index 5ae19b6e..6d4b79fb 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x4x4" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml index 4cc8f6bb..3460ddb6 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4" csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x4" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml index 212cd92d..93ef7cb7 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8" csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x8" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8" diff --git a/Ironwood/configs/collectives/reduce_scatter_1d.yaml b/Ironwood/configs/collectives/reduce_scatter_1d.yaml index 9c2c0dea..063d73fc 100644 --- a/Ironwood/configs/collectives/reduce_scatter_1d.yaml +++ b/Ironwood/configs/collectives/reduce_scatter_1d.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum_scatter benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/reduce_scatter_1d" csv_path: "../microbenchmarks/reduce_scatter_1d" xlml_metrics_dir: "../microbenchmarks/reduce_scatter_1d" diff --git a/Ironwood/configs/collectives/reduce_scatter_2d.yaml b/Ironwood/configs/collectives/reduce_scatter_2d.yaml index f329b571..027ac991 100644 --- a/Ironwood/configs/collectives/reduce_scatter_2d.yaml +++ b/Ironwood/configs/collectives/reduce_scatter_2d.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum_scatter benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} # Parallel Replica Groups - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} trace_dir: "../microbenchmarks/reduce_scatter_2d" csv_path: "../microbenchmarks/reduce_scatter_2d" xlml_metrics_dir: "../microbenchmarks/reduce_scatter_2d"