diff --git a/Ironwood/configs/collectives/all_gather_1d.yaml b/Ironwood/configs/collectives/all_gather_1d.yaml
index 85d8fc3e..0b1313dc 100644
--- a/Ironwood/configs/collectives/all_gather_1d.yaml
+++ b/Ironwood/configs/collectives/all_gather_1d.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_gather_1d"
   csv_path: "../microbenchmarks/all_gather_1d"
   xlml_metrics_dir: "../microbenchmarks/all_gather_1d"
diff --git a/Ironwood/configs/collectives/all_gather_2d.yaml b/Ironwood/configs/collectives/all_gather_2d.yaml
index 2d7a0e7a..c45f3e70 100644
--- a/Ironwood/configs/collectives/all_gather_2d.yaml
+++ b/Ironwood/configs/collectives/all_gather_2d.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} # Parallel Replica Groups
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica Groups
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5}
   trace_dir: "../microbenchmarks/all_gather_2d"
   csv_path: "../microbenchmarks/all_gather_2d"
   xlml_metrics_dir: "../microbenchmarks/all_gather_2d"
diff --git a/Ironwood/configs/collectives/all_gather_3d.yaml b/Ironwood/configs/collectives/all_gather_3d.yaml
index cc876a08..e159adfd 100644
--- a/Ironwood/configs/collectives/all_gather_3d.yaml
+++ b/Ironwood/configs/collectives/all_gather_3d.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5} # Parallel Replica Groups
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} # Non Parallel Replica Groups
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5}
   trace_dir: "../microbenchmarks/all_gather_3d"
   csv_path: "../microbenchmarks/all_gather_3d"
   xlml_metrics_dir: "../microbenchmarks/all_gather_3d"
diff --git a/Ironwood/configs/collectives/all_gather_demo.yaml b/Ironwood/configs/collectives/all_gather_demo.yaml
index 6fb5a757..a9d776cd 100644
--- a/Ironwood/configs/collectives/all_gather_demo.yaml
+++ b/Ironwood/configs/collectives/all_gather_demo.yaml
@@ -1,13 +1,9 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1} # Parallel Replica
-  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1} # Non-Parallel Replica
-  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2} # Non Parallel Replica Groups
-  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2} # Parallel Replica Groups
-  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3} # Parallel Replica Groups
-  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3} # Non Parallel Replica Groups
-
+  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1}
+  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2}
+  - {matrix_dim_range: {start: 2, end: 512, multiplier: 2}, dtype: "float32",  mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3}
 
   warmup_tries: 10
   trace_dir: "../microbenchmarks/all_gather_demo"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml
index 9bc586a1..0338aef1 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x1.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1"
   csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x1"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x1"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml
index b5be0c8d..9253bac5 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x2.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2"
   csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x2"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x2"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml
index 09b02979..9f8af67f 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x2x4.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4"
   csv_path: "../microbenchmarks/all_gather_tpu7x_2x2x4"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x2x4"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml
index 4f6cf11a..724fff00 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_2x4x4.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_2x4x4"
   csv_path: "../microbenchmarks/all_gather_tpu7x_2x4x4"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_2x4x4"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml
index 77f3ed13..65189cc9 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x4.yaml
@@ -1,9 +1,8 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4"
   csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x4"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4"
-  xla_dump_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4/hlo_graphs"
\ No newline at end of file
+  xla_dump_dir: "../microbenchmarks/all_gather_tpu7x_4x4x4/hlo_graphs"
diff --git a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml
index 12743d61..77c4da6f 100644
--- a/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml
+++ b/Ironwood/configs/collectives/all_gather_tpu7x_4x4x8.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_gather
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8"
   csv_path: "../microbenchmarks/all_gather_tpu7x_4x4x8"
   xlml_metrics_dir: "../microbenchmarks/all_gather_tpu7x_4x4x8"
diff --git a/Ironwood/configs/collectives/all_reduce_1d.yaml b/Ironwood/configs/collectives/all_reduce_1d.yaml
index 7b1d3068..d12d4221 100644
--- a/Ironwood/configs/collectives/all_reduce_1d.yaml
+++ b/Ironwood/configs/collectives/all_reduce_1d.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1" , op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non Parallel Replica
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_reduce_1d"
   csv_path: "../microbenchmarks/all_reduce_1d"
   xlml_metrics_dir: "../microbenchmarks/all_reduce_1d"
diff --git a/Ironwood/configs/collectives/all_reduce_2d.yaml b/Ironwood/configs/collectives/all_reduce_2d.yaml
index 93e1a7c9..5aa9654e 100644
--- a/Ironwood/configs/collectives/all_reduce_2d.yaml
+++ b/Ironwood/configs/collectives/all_reduce_2d.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} # Parallel Replica Groups
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5}  # Non Parallel Replica Groups
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5}
   trace_dir: "../microbenchmarks/all_reduce_2d"
   csv_path: "../microbenchmarks/all_reduce_2d"
   xlml_metrics_dir: "../microbenchmarks/all_reduce_2d"
diff --git a/Ironwood/configs/collectives/all_reduce_3d.yaml b/Ironwood/configs/collectives/all_reduce_3d.yaml
index f6a4ad9d..4e76b55f 100644
--- a/Ironwood/configs/collectives/all_reduce_3d.yaml
+++ b/Ironwood/configs/collectives/all_reduce_3d.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5} # Parallel Replica Groups
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} # Non Parallel Replica Groups
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "64x2", ici_size_range: 128, sharding_strategy: "64x1", op_dimension: 3, num_runs: 5}
   trace_dir: "../microbenchmarks/all_reduce_3d"
   csv_path: "../microbenchmarks/all_reduce_3d"
   xlml_metrics_dir: "../microbenchmarks/all_reduce_3d"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml
index f7389925..6d2d506c 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x1.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/psum_tpu7x_2x2x1"
   csv_path: "../microbenchmarks/psum_tpu7x_2x2x1"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x1"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml
index b2cb202c..d11981b0 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x2.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/psum_tpu7x_2x2x2"
   csv_path: "../microbenchmarks/psum_tpu7x_2x2x2"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x2"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml
index 946fd5ed..ab243b6f 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x2x4.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/psum_tpu7x_2x2x4"
   csv_path: "../microbenchmarks/psum_tpu7x_2x2x4"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x2x4"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml
index 613717cf..c731c622 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_2x4x4.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/psum_tpu7x_2x4x4"
   csv_path: "../microbenchmarks/psum_tpu7x_2x4x4"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_2x4x4"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml
index 3f4822c0..53d8dd3d 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x4.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/psum_tpu7x_4x4x4"
   csv_path: "../microbenchmarks/psum_tpu7x_4x4x4"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x4"
diff --git a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml
index a14bbfe8..f87878a4 100644
--- a/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml
+++ b/Ironwood/configs/collectives/all_reduce_tpu7x_4x4x8.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/psum_tpu7x_4x4x8"
   csv_path: "../microbenchmarks/psum_tpu7x_4x4x8"
   xlml_metrics_dir: "../microbenchmarks/psum_tpu7x_4x4x8"
diff --git a/Ironwood/configs/collectives/all_to_all_1d.yaml b/Ironwood/configs/collectives/all_to_all_1d.yaml
index 3c28194d..8d222613 100644
--- a/Ironwood/configs/collectives/all_to_all_1d.yaml
+++ b/Ironwood/configs/collectives/all_to_all_1d.yaml
@@ -1,7 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non Parallel Replica
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_to_all_1d"
   csv_path: "../microbenchmarks/all_to_all_1d"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_1d"
diff --git a/Ironwood/configs/collectives/all_to_all_2d.yaml b/Ironwood/configs/collectives/all_to_all_2d.yaml
index b4a1bc0e..d23115fe 100644
--- a/Ironwood/configs/collectives/all_to_all_2d.yaml
+++ b/Ironwood/configs/collectives/all_to_all_2d.yaml
@@ -1,7 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5}
   trace_dir: "../microbenchmarks/all_to_all_2d"
   csv_path: "../microbenchmarks/all_to_all_2d"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_2d"
diff --git a/Ironwood/configs/collectives/all_to_all_3d.yaml b/Ironwood/configs/collectives/all_to_all_3d.yaml
index 3aa0e2a7..c705754c 100644
--- a/Ironwood/configs/collectives/all_to_all_3d.yaml
+++ b/Ironwood/configs/collectives/all_to_all_3d.yaml
@@ -1,7 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5} # Non Parallel Replica Groups
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "4x4x8", op_dimension: 3, num_runs: 5}
   trace_dir: "../microbenchmarks/all_to_all_3d"
   csv_path: "../microbenchmarks/all_to_all_3d"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_3d"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml
index 96da2c38..f9786b29 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x1.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "2x2x2", ici_size_range: 8, sharding_strategy: "2x2x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x1"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x1"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml
index 388a4468..b530a698 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x2.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 8, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x2x2", ici_size_range: 16, sharding_strategy: "4x2x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x2"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x2"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml
index e0cc48c9..86e3dbbc 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x2x4.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 16, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x2", ici_size_range: 32, sharding_strategy: "4x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_2x2x4"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x2x4"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml
index 5ae19b6e..6d4b79fb 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_2x4x4.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 32, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "8x4x2", ici_size_range: 64, sharding_strategy: "8x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_2x4x4"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_2x4x4"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml
index 4cc8f6bb..3460ddb6 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x4.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 64, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "16x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x4"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x4"
diff --git a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml
index 212cd92d..93ef7cb7 100644
--- a/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml
+++ b/Ironwood/configs/collectives/all_to_all_tpu7x_4x4x8.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: all_to_all
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x2", op_dimension: 1,  num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 128, end: 16384, multiplier: 2}, dtype: "float32",  mesh_shape: "32x4x2", ici_size_range: 256, sharding_strategy: "32x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8"
   csv_path: "../microbenchmarks/all_to_all_tpu7x_4x4x8"
   xlml_metrics_dir: "../microbenchmarks/all_to_all_tpu7x_4x4x8"
diff --git a/Ironwood/configs/collectives/reduce_scatter_1d.yaml b/Ironwood/configs/collectives/reduce_scatter_1d.yaml
index 9c2c0dea..063d73fc 100644
--- a/Ironwood/configs/collectives/reduce_scatter_1d.yaml
+++ b/Ironwood/configs/collectives/reduce_scatter_1d.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum_scatter
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5} # Parallel Replica
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x4x8", ici_size_range: 128, sharding_strategy: "1x1x4", op_dimension: 1, num_runs: 5} # Non-Parallel Replica
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "16x4x2", ici_size_range: 128, sharding_strategy: "1x4x1", op_dimension: 1, num_runs: 5}
   trace_dir: "../microbenchmarks/reduce_scatter_1d"
   csv_path: "../microbenchmarks/reduce_scatter_1d"
   xlml_metrics_dir: "../microbenchmarks/reduce_scatter_1d"
diff --git a/Ironwood/configs/collectives/reduce_scatter_2d.yaml b/Ironwood/configs/collectives/reduce_scatter_2d.yaml
index f329b571..027ac991 100644
--- a/Ironwood/configs/collectives/reduce_scatter_2d.yaml
+++ b/Ironwood/configs/collectives/reduce_scatter_2d.yaml
@@ -1,8 +1,7 @@
 benchmarks:
 - benchmark_name: psum_scatter
   benchmark_sweep_params:
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5} # Parallel Replica Groups
-  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x32", ici_size_range: 128, sharding_strategy: "1x32", op_dimension: 2, num_runs: 5} # Non Parallel Replica Groups
+  - {matrix_dim_range: {start: 2, end: 8192, multiplier: 2}, dtype: "float32",  mesh_shape: "4x16x2", ici_size_range: 128, sharding_strategy: "1x16x1", op_dimension: 2, num_runs: 5}
   trace_dir: "../microbenchmarks/reduce_scatter_2d"
   csv_path: "../microbenchmarks/reduce_scatter_2d"
   xlml_metrics_dir: "../microbenchmarks/reduce_scatter_2d"
diff --git a/Ironwood/configs/host_device/host_device.yaml b/Ironwood/configs/host_device/host_device.yaml
index 8d572ed7..0b48800c 100644
--- a/Ironwood/configs/host_device/host_device.yaml
+++ b/Ironwood/configs/host_device/host_device.yaml
@@ -3,8 +3,7 @@ benchmarks:
   num_runs: 20
   benchmark_sweep_params:
   - {
-      data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768],
-      h2d_type: ["simple", "pipelined"]
+      data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768]
     }
   csv_path: "../microbenchmarks/host_device"
   trace_dir: "../microbenchmarks/host_device/trace"
diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py
index 8a36a2c7..16352e2a 100644
--- a/Ironwood/src/benchmark_host_device.py
+++ b/Ironwood/src/benchmark_host_device.py
@@ -5,6 +5,7 @@
 from typing import Any, Dict, Tuple, List
 
 import jax
+from jax import sharding
 import numpy as np
 from benchmark_utils import MetricsStatistics
 
@@ -22,9 +23,8 @@ def benchmark_host_device(
     data_size_mib: int,
     num_runs: int = 100,
     trace_dir: str = None,
-    h2d_type: str = "simple",
 ) -> Dict[str, Any]:
-    """Benchmarks H2D/D2H transfer using device_put/device_get."""
+    """Benchmarks H2D/D2H transfer using simple device_put/device_get."""
     
     num_elements = 1024 * 1024 * data_size_mib // np.dtype(np.float32).itemsize
     
@@ -32,18 +32,8 @@ def benchmark_host_device(
     column = 128
     host_data = np.random.normal(size=(num_elements // column, column)).astype(np.float32)
     
-    # Used in pipelined flow
-    num_devices_to_perform_h2d = 1
-    tensor_size = 4 * 1024 * 1024
-    target_device = jax.devices()[:num_devices_to_perform_h2d]
-    mesh = jax.sharding.Mesh(np.array(target_device), axis_names=["x"])
-    sharding = jax.sharding.NamedSharding(mesh, jax.sharding.PartitionSpec("x"))
-    pipelined_array = None
-    if h2d_type == "pipelined":
-        pipelined_array = np.random.normal(size=(tensor_size,)).astype(np.float32)
-
     print(
-        f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations with {h2d_type=}",
+        f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations",
         flush=True
     )
 
@@ -75,52 +65,29 @@ def benchmark_host_device(
             
             with step_context:
                  # H2D
-                if h2d_type == "simple":
-                    t0 = time.perf_counter()
-                    # Simple device_put
-                    device_array = jax.device_put(host_data)
-                    device_array.block_until_ready()
-                    t1 = time.perf_counter()
-                    
-                    # Verify H2D shape
-                    assert device_array.shape == host_data.shape
-
-                    h2d_perf.append((t1 - t0) * 1000)
+                t0 = time.perf_counter()
                 
-                    # D2H
-                    t2 = time.perf_counter()
-                    
-                    # Simple device_get
-                    # Note: device_get returns a numpy array (copy)
-                    _ = jax.device_get(device_array)
-                    
-                    t3 = time.perf_counter()
-                    d2h_perf.append((t3 - t2) * 1000)
-                    
-                    device_array.delete()
-                elif h2d_type == "pipelined":
-                    tensors_on_device = []
-                    if data_size_mib * 1024 * 1024 < pipelined_array.nbytes:
-                        print(f"Warning: {data_size_mib=} is smaller than pipeline unit, no data will be transferred.")
-                    t0 = time.perf_counter()
-                    # Assume data_size_mib is total across devices for now
-                    bytes_left = 1024 * 1024 * data_size_mib
-                    while bytes_left >= pipelined_array.nbytes:
-                        with jax.profiler.StepTraceAnnotation("device_put", step_num=1):
-                            x_device = jax.device_put(pipelined_array, sharding)
-                            tensors_on_device.append(x_device)
-                            bytes_left -= pipelined_array.nbytes
-                    
-                    total_bytes_transferred = 0
-                    for tensor in tensors_on_device:
-                        tensor.block_until_ready()
-                        total_bytes_transferred += tensor.nbytes
-                        tensor.delete()
-                    t1 = time.perf_counter()
-
-                    h2d_perf.append((t1 - t0) * 1000)
-                    # Implement D2H at a later time after we establish H2D
-                    d2h_perf.append(0)
+                # Simple device_put
+                device_array = jax.device_put(host_data)
+                device_array.block_until_ready()
+                
+                t1 = time.perf_counter()
+                h2d_perf.append((t1 - t0) * 1000)
+                
+                # Verify H2D shape
+                assert device_array.shape == host_data.shape
+                
+                # D2H
+                t2 = time.perf_counter()
+                
+                # Simple device_get
+                # Note: device_get returns a numpy array (copy)
+                _ = jax.device_get(device_array)
+                
+                t3 = time.perf_counter()
+                d2h_perf.append((t3 - t2) * 1000)
+                
+                device_array.delete()
 
     return {
         "H2D_Bandwidth_ms": h2d_perf,
@@ -131,7 +98,6 @@ def benchmark_host_device_calculate_metrics(
     data_size_mib: int,
     H2D_Bandwidth_ms: List[float],
     D2H_Bandwidth_ms: List[float],
-    h2d_type: str = "simple",
 ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
     """Calculates metrics for Host-Device transfer."""
     params = locals().items()