From 75f7ed60aeedaa96249645105633900a18fa0b97 Mon Sep 17 00:00:00 2001 From: Chi Shuen Lee Date: Tue, 27 Jan 2026 11:04:15 +0800 Subject: [PATCH] Revert the changes that were made for an urgent demo --- Ironwood/configs/collectives/all_gather_1d.yaml | 3 ++- Ironwood/configs/collectives/all_gather_2d.yaml | 3 ++- Ironwood/configs/collectives/all_gather_3d.yaml | 3 ++- Ironwood/configs/collectives/all_gather_demo.yaml | 10 +++++++--- .../configs/collectives/all_gather_tpu7x_2x2x1.yaml | 3 ++- .../configs/collectives/all_gather_tpu7x_2x2x2.yaml | 3 ++- .../configs/collectives/all_gather_tpu7x_2x2x4.yaml | 3 ++- .../configs/collectives/all_gather_tpu7x_2x4x4.yaml | 3 ++- .../configs/collectives/all_gather_tpu7x_4x4x4.yaml | 5 +++-- .../configs/collectives/all_gather_tpu7x_4x4x8.yaml | 3 ++- Ironwood/configs/collectives/all_reduce_1d.yaml | 3 ++- Ironwood/configs/collectives/all_reduce_2d.yaml | 3 ++- Ironwood/configs/collectives/all_reduce_3d.yaml | 3 ++- .../configs/collectives/all_reduce_tpu7x_2x2x1.yaml | 3 ++- .../configs/collectives/all_reduce_tpu7x_2x2x2.yaml | 3 ++- .../configs/collectives/all_reduce_tpu7x_2x2x4.yaml | 3 ++- .../configs/collectives/all_reduce_tpu7x_2x4x4.yaml | 3 ++- .../configs/collectives/all_reduce_tpu7x_4x4x4.yaml | 3 ++- .../configs/collectives/all_reduce_tpu7x_4x4x8.yaml | 3 ++- Ironwood/configs/collectives/all_to_all_1d.yaml | 2 +- Ironwood/configs/collectives/all_to_all_2d.yaml | 2 +- Ironwood/configs/collectives/all_to_all_3d.yaml | 2 +- .../configs/collectives/all_to_all_tpu7x_2x2x1.yaml | 3 ++- .../configs/collectives/all_to_all_tpu7x_2x2x2.yaml | 3 ++- .../configs/collectives/all_to_all_tpu7x_2x2x4.yaml | 3 ++- .../configs/collectives/all_to_all_tpu7x_2x4x4.yaml | 3 ++- .../configs/collectives/all_to_all_tpu7x_4x4x4.yaml | 3 ++- .../configs/collectives/all_to_all_tpu7x_4x4x8.yaml | 3 ++- Ironwood/configs/collectives/reduce_scatter_1d.yaml | 3 ++- Ironwood/configs/collectives/reduce_scatter_2d.yaml | 3 ++- 30 files changed, 63 insertions(+), 33 deletions(-) diff --git a/Ironwood/configs/collectives/all_gather_1d.yaml b/Ironwood/configs/collectives/all_gather_1d.yaml index 0b1313dc..85d8fc3e 100644 --- a/Ironwood/configs/collectives/all_gather_1d.yaml +++ b/Ironwood/configs/collectives/all_gather_1d.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_gather_1d" csv_path: "../microbenchmarks/all_gather_1d" xlml_metrics_dir: "../microbenchmarks/all_gather_1d" diff --git a/Ironwood/configs/collectives/all_gather_2d.yaml b/Ironwood/configs/collectives/all_gather_2d.yaml index c45f3e70..2d7a0e7a 100644 --- a/Ironwood/configs/collectives/all_gather_2d.yaml +++ b/Ironwood/configs/collectives/all_gather_2d.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} # Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica Groups trace_dir: "../microbenchmarks/all_gather_2d" csv_path: "../microbenchmarks/all_gather_2d" xlml_metrics_dir: "../microbenchmarks/all_gather_2d" diff --git a/Ironwood/configs/collectives/all_gather_3d.yaml b/Ironwood/configs/collectives/all_gather_3d.yaml index e159adfd..cc876a08 100644 --- a/Ironwood/configs/collectives/all_gather_3d.yaml +++ b/Ironwood/configs/collectives/all_gather_3d.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5} + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5} # Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} # Non Parallel Replica Groups trace_dir: "../microbenchmarks/all_gather_3d" csv_path: "../microbenchmarks/all_gather_3d" xlml_metrics_dir: "../microbenchmarks/all_gather_3d" diff --git a/Ironwood/configs/collectives/all_gather_demo.yaml b/Ironwood/configs/collectives/all_gather_demo.yaml index a9d776cd..6fb5a757 100644 --- a/Ironwood/configs/collectives/all_gather_demo.yaml +++ b/Ironwood/configs/collectives/all_gather_demo.yaml @@ -1,9 +1,13 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1} - - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2} - - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3} + - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1} # Parallel Replica + - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1} # Non-Parallel Replica + - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2} # Non Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2} # Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3} # Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3} # Non Parallel Replica Groups + warmup_tries: 10 trace_dir: "../microbenchmarks/all_gather_demo" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml index 0338aef1..9bc586a1 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1" csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x1" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml index 9253bac5..b5be0c8d 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2" csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x2" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml index 9f8af67f..09b02979 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4" csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x4" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml index 724fff00..4f6cf11a 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_gather_tpu7x_2x4x4" csv_path: "../microbenchmarks/all_gather_tpu7x_2x4x4" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x4x4" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml index 65189cc9..77f3ed13 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml @@ -1,8 +1,9 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 10} + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4" csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x4" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4" - xla_dump_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4/hlo_graphs" + xla_dump_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4/hlo_graphs" \ No newline at end of file diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml index 77c4da6f..12743d61 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8" csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x8" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8" diff --git a/Ironwood/configs/collectives/all_reduce_1d.yaml b/Ironwood/configs/collectives/all_reduce_1d.yaml index d12d4221..7b1d3068 100644 --- a/Ironwood/configs/collectives/all_reduce_1d.yaml +++ b/Ironwood/configs/collectives/all_reduce_1d.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1" , op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1" , op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non Parallel Replica trace_dir: "../microbenchmarks/all_reduce_1d" csv_path: "../microbenchmarks/all_reduce_1d" xlml_metrics_dir: "../microbenchmarks/all_reduce_1d" diff --git a/Ironwood/configs/collectives/all_reduce_2d.yaml b/Ironwood/configs/collectives/all_reduce_2d.yaml index 5aa9654e..93e1a7c9 100644 --- a/Ironwood/configs/collectives/all_reduce_2d.yaml +++ b/Ironwood/configs/collectives/all_reduce_2d.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} # Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica Groups trace_dir: "../microbenchmarks/all_reduce_2d" csv_path: "../microbenchmarks/all_reduce_2d" xlml_metrics_dir: "../microbenchmarks/all_reduce_2d" diff --git a/Ironwood/configs/collectives/all_reduce_3d.yaml b/Ironwood/configs/collectives/all_reduce_3d.yaml index 4e76b55f..f6a4ad9d 100644 --- a/Ironwood/configs/collectives/all_reduce_3d.yaml +++ b/Ironwood/configs/collectives/all_reduce_3d.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5} + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5} # Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} # Non Parallel Replica Groups trace_dir: "../microbenchmarks/all_reduce_3d" csv_path: "../microbenchmarks/all_reduce_3d" xlml_metrics_dir: "../microbenchmarks/all_reduce_3d" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml index 6d2d506c..f7389925 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/psum_tpu7x_2x2x1" csv_path: "../microbenchmarks/psum_tpu7x_2x2x1" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x1" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml index d11981b0..b2cb202c 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/psum_tpu7x_2x2x2" csv_path: "../microbenchmarks/psum_tpu7x_2x2x2" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x2" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml index ab243b6f..946fd5ed 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/psum_tpu7x_2x2x4" csv_path: "../microbenchmarks/psum_tpu7x_2x2x4" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x4" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml index c731c622..613717cf 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/psum_tpu7x_2x4x4" csv_path: "../microbenchmarks/psum_tpu7x_2x4x4" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x4x4" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml index 53d8dd3d..3f4822c0 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/psum_tpu7x_4x4x4" csv_path: "../microbenchmarks/psum_tpu7x_4x4x4" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x4" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml index f87878a4..a14bbfe8 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/psum_tpu7x_4x4x8" csv_path: "../microbenchmarks/psum_tpu7x_4x4x8" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x8" diff --git a/Ironwood/configs/collectives/all_to_all_1d.yaml b/Ironwood/configs/collectives/all_to_all_1d.yaml index 8d222613..3c28194d 100644 --- a/Ironwood/configs/collectives/all_to_all_1d.yaml +++ b/Ironwood/configs/collectives/all_to_all_1d.yaml @@ -1,7 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non Parallel Replica trace_dir: "../microbenchmarks/all_to_all_1d" csv_path: "../microbenchmarks/all_to_all_1d" xlml_metrics_dir: "../microbenchmarks/all_to_all_1d" diff --git a/Ironwood/configs/collectives/all_to_all_2d.yaml b/Ironwood/configs/collectives/all_to_all_2d.yaml index d23115fe..b4a1bc0e 100644 --- a/Ironwood/configs/collectives/all_to_all_2d.yaml +++ b/Ironwood/configs/collectives/all_to_all_2d.yaml @@ -1,7 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica trace_dir: "../microbenchmarks/all_to_all_2d" csv_path: "../microbenchmarks/all_to_all_2d" xlml_metrics_dir: "../microbenchmarks/all_to_all_2d" diff --git a/Ironwood/configs/collectives/all_to_all_3d.yaml b/Ironwood/configs/collectives/all_to_all_3d.yaml index c705754c..3aa0e2a7 100644 --- a/Ironwood/configs/collectives/all_to_all_3d.yaml +++ b/Ironwood/configs/collectives/all_to_all_3d.yaml @@ -1,7 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} # Non Parallel Replica Groups trace_dir: "../microbenchmarks/all_to_all_3d" csv_path: "../microbenchmarks/all_to_all_3d" xlml_metrics_dir: "../microbenchmarks/all_to_all_3d" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml index f9786b29..96da2c38 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x1" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml index b530a698..388a4468 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x2" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml index 86e3dbbc..e0cc48c9 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x4" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml index 6d4b79fb..5ae19b6e 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x4x4" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml index 3460ddb6..4cc8f6bb 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4" csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x4" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml index 93ef7cb7..212cd92d 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8" csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x8" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8" diff --git a/Ironwood/configs/collectives/reduce_scatter_1d.yaml b/Ironwood/configs/collectives/reduce_scatter_1d.yaml index 063d73fc..9c2c0dea 100644 --- a/Ironwood/configs/collectives/reduce_scatter_1d.yaml +++ b/Ironwood/configs/collectives/reduce_scatter_1d.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: psum_scatter benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non-Parallel Replica trace_dir: "../microbenchmarks/reduce_scatter_1d" csv_path: "../microbenchmarks/reduce_scatter_1d" xlml_metrics_dir: "../microbenchmarks/reduce_scatter_1d" diff --git a/Ironwood/configs/collectives/reduce_scatter_2d.yaml b/Ironwood/configs/collectives/reduce_scatter_2d.yaml index 027ac991..f329b571 100644 --- a/Ironwood/configs/collectives/reduce_scatter_2d.yaml +++ b/Ironwood/configs/collectives/reduce_scatter_2d.yaml @@ -1,7 +1,8 @@ benchmarks: - benchmark_name: psum_scatter benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} # Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica Groups trace_dir: "../microbenchmarks/reduce_scatter_2d" csv_path: "../microbenchmarks/reduce_scatter_2d" xlml_metrics_dir: "../microbenchmarks/reduce_scatter_2d"