diff --git a/Ironwood/configs/collectives/all_gather_1d.yaml b/Ironwood/configs/collectives/all_gather_1d.yaml index 85d8fc3e..0b1313dc 100644 --- a/Ironwood/configs/collectives/all_gather_1d.yaml +++ b/Ironwood/configs/collectives/all_gather_1d.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_gather_1d" csv_path: "../microbenchmarks/all_gather_1d" xlml_metrics_dir: "../microbenchmarks/all_gather_1d" diff --git a/Ironwood/configs/collectives/all_gather_2d.yaml b/Ironwood/configs/collectives/all_gather_2d.yaml index 2d7a0e7a..c45f3e70 100644 --- a/Ironwood/configs/collectives/all_gather_2d.yaml +++ b/Ironwood/configs/collectives/all_gather_2d.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} # Parallel Replica Groups - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} trace_dir: "../microbenchmarks/all_gather_2d" csv_path: "../microbenchmarks/all_gather_2d" xlml_metrics_dir: "../microbenchmarks/all_gather_2d" diff --git a/Ironwood/configs/collectives/all_gather_3d.yaml b/Ironwood/configs/collectives/all_gather_3d.yaml index cc876a08..e159adfd 100644 --- a/Ironwood/configs/collectives/all_gather_3d.yaml +++ b/Ironwood/configs/collectives/all_gather_3d.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5} # Parallel Replica Groups - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} # Non Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5} trace_dir: "../microbenchmarks/all_gather_3d" csv_path: "../microbenchmarks/all_gather_3d" xlml_metrics_dir: "../microbenchmarks/all_gather_3d" diff --git a/Ironwood/configs/collectives/all_gather_demo.yaml b/Ironwood/configs/collectives/all_gather_demo.yaml index 6fb5a757..a9d776cd 100644 --- a/Ironwood/configs/collectives/all_gather_demo.yaml +++ b/Ironwood/configs/collectives/all_gather_demo.yaml @@ -1,13 +1,9 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1} # Parallel Replica - - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1} # Non-Parallel Replica - - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2} # Non Parallel Replica Groups - - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2} # Parallel Replica Groups - - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3} # Parallel Replica Groups - - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3} # Non Parallel Replica Groups - + - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1} + - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2} + - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3} warmup_tries: 10 trace_dir: "../microbenchmarks/all_gather_demo" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml index 9bc586a1..0338aef1 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1" csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x1" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml index b5be0c8d..9253bac5 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2" csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x2" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml index 09b02979..9f8af67f 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4" csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x4" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml index 4f6cf11a..724fff00 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_gather_tpu7x_2x4x4" csv_path: "../microbenchmarks/all_gather_tpu7x_2x4x4" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x4x4" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml index 77f3ed13..65189cc9 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml @@ -1,9 +1,8 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 10} trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4" csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x4" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4" - xla_dump_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4/hlo_graphs" \ No newline at end of file + xla_dump_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4/hlo_graphs" diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml index 12743d61..77c4da6f 100644 --- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml +++ b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_gather benchmark_sweep_params: - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8" csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x8" xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8" diff --git a/Ironwood/configs/collectives/all_reduce_1d.yaml b/Ironwood/configs/collectives/all_reduce_1d.yaml index 7b1d3068..d12d4221 100644 --- a/Ironwood/configs/collectives/all_reduce_1d.yaml +++ b/Ironwood/configs/collectives/all_reduce_1d.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1" , op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non Parallel Replica + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1" , op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_reduce_1d" csv_path: "../microbenchmarks/all_reduce_1d" xlml_metrics_dir: "../microbenchmarks/all_reduce_1d" diff --git a/Ironwood/configs/collectives/all_reduce_2d.yaml b/Ironwood/configs/collectives/all_reduce_2d.yaml index 93e1a7c9..5aa9654e 100644 --- a/Ironwood/configs/collectives/all_reduce_2d.yaml +++ b/Ironwood/configs/collectives/all_reduce_2d.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} # Parallel Replica Groups - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} trace_dir: "../microbenchmarks/all_reduce_2d" csv_path: "../microbenchmarks/all_reduce_2d" xlml_metrics_dir: "../microbenchmarks/all_reduce_2d" diff --git a/Ironwood/configs/collectives/all_reduce_3d.yaml b/Ironwood/configs/collectives/all_reduce_3d.yaml index f6a4ad9d..4e76b55f 100644 --- a/Ironwood/configs/collectives/all_reduce_3d.yaml +++ b/Ironwood/configs/collectives/all_reduce_3d.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5} # Parallel Replica Groups - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} # Non Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5} trace_dir: "../microbenchmarks/all_reduce_3d" csv_path: "../microbenchmarks/all_reduce_3d" xlml_metrics_dir: "../microbenchmarks/all_reduce_3d" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml index f7389925..6d2d506c 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/psum_tpu7x_2x2x1" csv_path: "../microbenchmarks/psum_tpu7x_2x2x1" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x1" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml index b2cb202c..d11981b0 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/psum_tpu7x_2x2x2" csv_path: "../microbenchmarks/psum_tpu7x_2x2x2" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x2" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml index 946fd5ed..ab243b6f 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/psum_tpu7x_2x2x4" csv_path: "../microbenchmarks/psum_tpu7x_2x2x4" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x4" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml index 613717cf..c731c622 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/psum_tpu7x_2x4x4" csv_path: "../microbenchmarks/psum_tpu7x_2x4x4" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x4x4" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml index 3f4822c0..53d8dd3d 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/psum_tpu7x_4x4x4" csv_path: "../microbenchmarks/psum_tpu7x_4x4x4" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x4" diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml index a14bbfe8..f87878a4 100644 --- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml +++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum benchmark_sweep_params: - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/psum_tpu7x_4x4x8" csv_path: "../microbenchmarks/psum_tpu7x_4x4x8" xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x8" diff --git a/Ironwood/configs/collectives/all_to_all_1d.yaml b/Ironwood/configs/collectives/all_to_all_1d.yaml index 3c28194d..8d222613 100644 --- a/Ironwood/configs/collectives/all_to_all_1d.yaml +++ b/Ironwood/configs/collectives/all_to_all_1d.yaml @@ -1,7 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non Parallel Replica + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_to_all_1d" csv_path: "../microbenchmarks/all_to_all_1d" xlml_metrics_dir: "../microbenchmarks/all_to_all_1d" diff --git a/Ironwood/configs/collectives/all_to_all_2d.yaml b/Ironwood/configs/collectives/all_to_all_2d.yaml index b4a1bc0e..d23115fe 100644 --- a/Ironwood/configs/collectives/all_to_all_2d.yaml +++ b/Ironwood/configs/collectives/all_to_all_2d.yaml @@ -1,7 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} trace_dir: "../microbenchmarks/all_to_all_2d" csv_path: "../microbenchmarks/all_to_all_2d" xlml_metrics_dir: "../microbenchmarks/all_to_all_2d" diff --git a/Ironwood/configs/collectives/all_to_all_3d.yaml b/Ironwood/configs/collectives/all_to_all_3d.yaml index 3aa0e2a7..c705754c 100644 --- a/Ironwood/configs/collectives/all_to_all_3d.yaml +++ b/Ironwood/configs/collectives/all_to_all_3d.yaml @@ -1,7 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} # Non Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} trace_dir: "../microbenchmarks/all_to_all_3d" csv_path: "../microbenchmarks/all_to_all_3d" xlml_metrics_dir: "../microbenchmarks/all_to_all_3d" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml index 96da2c38..f9786b29 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x1" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml index 388a4468..b530a698 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x2" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml index e0cc48c9..86e3dbbc 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x4" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml index 5ae19b6e..6d4b79fb 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4" csv_path: "../microbenchmarks/all_to_all_tpu7x_2x4x4" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml index 4cc8f6bb..3460ddb6 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4" csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x4" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4" diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml index 212cd92d..93ef7cb7 100644 --- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml +++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: all_to_all benchmark_sweep_params: - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32", mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8" csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x8" xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8" diff --git a/Ironwood/configs/collectives/reduce_scatter_1d.yaml b/Ironwood/configs/collectives/reduce_scatter_1d.yaml index 9c2c0dea..063d73fc 100644 --- a/Ironwood/configs/collectives/reduce_scatter_1d.yaml +++ b/Ironwood/configs/collectives/reduce_scatter_1d.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum_scatter benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non-Parallel Replica + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} trace_dir: "../microbenchmarks/reduce_scatter_1d" csv_path: "../microbenchmarks/reduce_scatter_1d" xlml_metrics_dir: "../microbenchmarks/reduce_scatter_1d" diff --git a/Ironwood/configs/collectives/reduce_scatter_2d.yaml b/Ironwood/configs/collectives/reduce_scatter_2d.yaml index f329b571..027ac991 100644 --- a/Ironwood/configs/collectives/reduce_scatter_2d.yaml +++ b/Ironwood/configs/collectives/reduce_scatter_2d.yaml @@ -1,8 +1,7 @@ benchmarks: - benchmark_name: psum_scatter benchmark_sweep_params: - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} # Parallel Replica Groups - - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica Groups + - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32", mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} trace_dir: "../microbenchmarks/reduce_scatter_2d" csv_path: "../microbenchmarks/reduce_scatter_2d" xlml_metrics_dir: "../microbenchmarks/reduce_scatter_2d" diff --git a/Ironwood/configs/host_device/host_device.yaml b/Ironwood/configs/host_device/host_device.yaml index 8d572ed7..0b48800c 100644 --- a/Ironwood/configs/host_device/host_device.yaml +++ b/Ironwood/configs/host_device/host_device.yaml @@ -3,8 +3,7 @@ benchmarks: num_runs: 20 benchmark_sweep_params: - { - data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768], - h2d_type: ["simple", "pipelined"] + data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768] } csv_path: "../microbenchmarks/host_device" trace_dir: "../microbenchmarks/host_device/trace" diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py index 8a36a2c7..16352e2a 100644 --- a/Ironwood/src/benchmark_host_device.py +++ b/Ironwood/src/benchmark_host_device.py @@ -5,6 +5,7 @@ from typing import Any, Dict, Tuple, List import jax +from jax import sharding import numpy as np from benchmark_utils import MetricsStatistics @@ -22,9 +23,8 @@ def benchmark_host_device( data_size_mib: int, num_runs: int = 100, trace_dir: str = None, - h2d_type: str = "simple", ) -> Dict[str, Any]: - """Benchmarks H2D/D2H transfer using device_put/device_get.""" + """Benchmarks H2D/D2H transfer using simple device_put/device_get.""" num_elements = 1024 * 1024 * data_size_mib // np.dtype(np.float32).itemsize @@ -32,18 +32,8 @@ def benchmark_host_device( column = 128 host_data = np.random.normal(size=(num_elements // column, column)).astype(np.float32) - # Used in pipelined flow - num_devices_to_perform_h2d = 1 - tensor_size = 4 * 1024 * 1024 - target_device = jax.devices()[:num_devices_to_perform_h2d] - mesh = jax.sharding.Mesh(np.array(target_device), axis_names=["x"]) - sharding = jax.sharding.NamedSharding(mesh, jax.sharding.PartitionSpec("x")) - pipelined_array = None - if h2d_type == "pipelined": - pipelined_array = np.random.normal(size=(tensor_size,)).astype(np.float32) - print( - f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations with {h2d_type=}", + f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations", flush=True ) @@ -75,52 +65,29 @@ def benchmark_host_device( with step_context: # H2D - if h2d_type == "simple": - t0 = time.perf_counter() - # Simple device_put - device_array = jax.device_put(host_data) - device_array.block_until_ready() - t1 = time.perf_counter() - - # Verify H2D shape - assert device_array.shape == host_data.shape - - h2d_perf.append((t1 - t0) * 1000) + t0 = time.perf_counter() - # D2H - t2 = time.perf_counter() - - # Simple device_get - # Note: device_get returns a numpy array (copy) - _ = jax.device_get(device_array) - - t3 = time.perf_counter() - d2h_perf.append((t3 - t2) * 1000) - - device_array.delete() - elif h2d_type == "pipelined": - tensors_on_device = [] - if data_size_mib * 1024 * 1024 < pipelined_array.nbytes: - print(f"Warning: {data_size_mib=} is smaller than pipeline unit, no data will be transferred.") - t0 = time.perf_counter() - # Assume data_size_mib is total across devices for now - bytes_left = 1024 * 1024 * data_size_mib - while bytes_left >= pipelined_array.nbytes: - with jax.profiler.StepTraceAnnotation("device_put", step_num=1): - x_device = jax.device_put(pipelined_array, sharding) - tensors_on_device.append(x_device) - bytes_left -= pipelined_array.nbytes - - total_bytes_transferred = 0 - for tensor in tensors_on_device: - tensor.block_until_ready() - total_bytes_transferred += tensor.nbytes - tensor.delete() - t1 = time.perf_counter() - - h2d_perf.append((t1 - t0) * 1000) - # Implement D2H at a later time after we establish H2D - d2h_perf.append(0) + # Simple device_put + device_array = jax.device_put(host_data) + device_array.block_until_ready() + + t1 = time.perf_counter() + h2d_perf.append((t1 - t0) * 1000) + + # Verify H2D shape + assert device_array.shape == host_data.shape + + # D2H + t2 = time.perf_counter() + + # Simple device_get + # Note: device_get returns a numpy array (copy) + _ = jax.device_get(device_array) + + t3 = time.perf_counter() + d2h_perf.append((t3 - t2) * 1000) + + device_array.delete() return { "H2D_Bandwidth_ms": h2d_perf, @@ -131,7 +98,6 @@ def benchmark_host_device_calculate_metrics( data_size_mib: int, H2D_Bandwidth_ms: List[float], D2H_Bandwidth_ms: List[float], - h2d_type: str = "simple", ) -> Tuple[Dict[str, Any], Dict[str, Any]]: """Calculates metrics for Host-Device transfer.""" params = locals().items()