Commit 1bfbaf9

Add diagnostic mode to lax.scan profiler
- Add --diagnose flag that tests time scaling across iteration counts
- If time scales linearly with iterations (not compute), this indicates constant per-iteration overhead (CPU-GPU synchronization)
- Also add --verbose flag for CUDA/XLA logging
- Update CI to run with --diagnose flag
1 parent 8fbb9a7 · commit 1bfbaf9
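The heuristic behind the new mode, as a standalone sketch (the function name and timings here are illustrative only; the real check lives in scripts/profile_lax_scan.py and uses the same 0.5x-2.0x tolerance band):

    # Hypothetical helper mirroring the --diagnose logic: if runtime grows
    # roughly in proportion to iteration count, the cost is dominated by a
    # constant per-iteration overhead such as a CPU-GPU sync.
    def scales_linearly(t_small, t_large, n_small, n_large):
        observed_ratio = t_large / t_small
        expected_ratio = n_large / n_small  # e.g. 100x for 1k -> 100k iterations
        return 0.5 * expected_ratio < observed_ratio < 2.0 * expected_ratio

    # Illustrative numbers: 1k iterations in 0.05 s, 100k in 5.0 s -> 100x ratio.
    print(scales_linearly(0.05, 5.0, 1_000, 100_000))  # True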

File tree: 2 files changed, +70 -2 lines

- .github/workflows/ci.yml
- scripts/profile_lax_scan.py

.github/workflows/ci.yml

Lines changed: 3 additions & 2 deletions
@@ -32,9 +32,10 @@ jobs:
           echo "=== lax.scan Performance Profiling ==="
           echo "This profiles the known issue with lax.scan on GPU (JAX Issue #2491)"
           echo ""
-          python scripts/profile_lax_scan.py --iterations 100000
+          python scripts/profile_lax_scan.py --iterations 100000 --diagnose
           echo ""
-          echo "Note: GPU is expected to be much slower due to CPU-GPU sync per iteration"
+          echo "The diagnostic shows if time scales linearly with iterations,"
+          echo "which indicates constant per-iteration CPU-GPU sync overhead."
       # === Benchmark Tests (Bare Metal, Jupyter, Jupyter-Book) ===
       - name: Run Hardware Benchmarks (Bare Metal)
         shell: bash -l {0}
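
To reproduce this CI step locally, the same invocation should work from the repository root (a usage note; adding --verbose also turns on the CUDA/XLA logging introduced below):

    python scripts/profile_lax_scan.py --iterations 100000 --diagnose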

scripts/profile_lax_scan.py

Lines changed: 67 additions & 0 deletions
@@ -34,6 +34,13 @@ def setup_xla_dump(dump_dir="/tmp/xla_dump"):
     os.environ["XLA_FLAGS"] = f"--xla_dump_to={dump_dir} --xla_dump_hlo_as_text"
     print(f"XLA dumps will be written to: {dump_dir}")
 
+def setup_cuda_logging():
+    """Enable CUDA/XLA logging to see sync patterns."""
+    # These may help reveal synchronization behavior
+    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0"  # Show all TF/XLA logs
+    os.environ["XLA_FLAGS"] = os.environ.get("XLA_FLAGS", "") + " --xla_gpu_cuda_data_dir=/usr/local/cuda"
+    print("CUDA/XLA logging enabled")
+
 def main():
     parser = argparse.ArgumentParser(description="Profile lax.scan GPU performance")
     parser.add_argument("--nsys", action="store_true",
@@ -42,6 +49,10 @@ def main():
                         help="Enable JAX profiler (view with TensorBoard)")
     parser.add_argument("--xla-dump", action="store_true",
                         help="Dump XLA HLO for analysis")
+    parser.add_argument("--verbose", action="store_true",
+                        help="Enable verbose CUDA/XLA logging")
+    parser.add_argument("--diagnose", action="store_true",
+                        help="Run diagnostic to demonstrate sync overhead")
     parser.add_argument("-n", "--iterations", type=int, default=10_000_000,
                         dest="n", help="Number of iterations (default: 10M)")
     parser.add_argument("--profile-dir", type=str, default="/tmp/jax-trace",
@@ -51,6 +62,9 @@ def main():
     # Setup XLA dump before importing JAX
     if args.xla_dump:
         setup_xla_dump()
+
+    if args.verbose:
+        setup_cuda_logging()
 
     # Now import JAX
     import jax
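
A side note on ordering: setup_xla_dump() and setup_cuda_logging() only mutate os.environ, which is why the script calls them before JAX is imported; the script's own comments treat import time as the deadline for setting these flags. A minimal sketch of the same pattern:

    import os

    # Environment flags must be in place before JAX starts up; setting
    # them after the import below is generally too late to take effect.
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0"

    import jax  # reads the environment during initialization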
@@ -170,5 +184,58 @@ def update(x, t):
     print("\nNsight Systems trace will be saved as lax_scan_profile.nsys-rep")
     print("View with: nsys-ui lax_scan_profile.nsys-rep")
 
+    # Diagnostic: demonstrate sync overhead by showing time scaling
+    if args.diagnose:
+        print("\n" + "=" * 60)
+        print("DIAGNOSTIC: Per-iteration Sync Overhead Analysis")
+        print("=" * 60)
+        print("\nIf there's a CPU-GPU sync per iteration, time should scale")
+        print("linearly with iteration count (not with compute work).\n")
+
+        # Test different iteration counts
+        test_ns = [1000, 5000, 10000, 50000, 100000]
+
+        print("Iteration Count | GPU Time (s) | Time/Iter (µs) | Expected if O(n)")
+        print("-" * 70)
+
+        gpu_times = []
+        for test_n in test_ns:
+            # Define fresh function for this n
+            @partial(jax.jit, static_argnums=(1,))
+            def qm_test(x0, n, α=4.0):
+                def update(x, t):
+                    return α * x * (1 - x), α * x * (1 - x)
+                _, x = lax.scan(update, x0, jnp.arange(n))
+                return jnp.concatenate([jnp.array([x0]), x])
+
+            # Compile
+            _ = qm_test(0.1, test_n).block_until_ready()
+
+            # Time
+            t0 = time.perf_counter()
+            _ = qm_test(0.1, test_n).block_until_ready()
+            elapsed = time.perf_counter() - t0
+            gpu_times.append(elapsed)
+
+            time_per_iter = (elapsed / test_n) * 1_000_000  # microseconds
+            expected = gpu_times[0] * (test_n / test_ns[0])  # O(n) extrapolation from the first measurement
+
+            print(f"{test_n:>15,} | {elapsed:>12.6f} | {time_per_iter:>14.2f} | {expected:.6f}")
+
+        # Calculate if time scales linearly (indicating per-iteration overhead)
+        ratio_1k_to_100k = gpu_times[-1] / gpu_times[0]
+        expected_ratio = test_ns[-1] / test_ns[0]  # 100x if linear
+
+        print("\nScaling analysis:")
+        print(f"  Time ratio (100k/1k iterations): {ratio_1k_to_100k:.1f}x")
+        print(f"  Expected if linear O(n): {expected_ratio:.1f}x")
+
+        if 0.5 * expected_ratio < ratio_1k_to_100k < 2.0 * expected_ratio:
+            print("\n✓ Time scales ~linearly with iterations!")
+            print("  This indicates constant per-iteration overhead (CPU-GPU sync).")
+            print(f"  Estimated sync overhead: ~{(gpu_times[0]/test_ns[0])*1e6:.1f} µs per iteration")
+        else:
+            print("\n? Scaling is not linear - may be other factors involved")
+
 if __name__ == "__main__":
     main()
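
The compile-then-time pattern inside the loop above, pulled out as a self-contained sketch (toy jitted function; only the measurement pattern matches the script). Because qm_test marks n as static via static_argnums, each iteration count triggers a fresh compile, which is why the untimed warm-up call matters:

    import time
    import jax
    import jax.numpy as jnp

    f = jax.jit(lambda x: (4.0 * x * (1 - x)).sum())
    x = jnp.linspace(0.1, 0.9, 1000)

    _ = f(x).block_until_ready()  # first call compiles; keep it out of the timing
    t0 = time.perf_counter()
    _ = f(x).block_until_ready()  # block: JAX dispatches work asynchronously,
                                  # so the timer must wait for the actual result
    print(f"elapsed: {time.perf_counter() - t0:.6f} s")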
