DEBUG: Add hardware benchmark script to diagnose performance

mmcky · mmcky · commit 0b6a5673d287 · 2025-11-27T15:50:42.000+11:00
- Add benchmark-hardware.py with CPU, NumPy, Numba, and JAX benchmarks
- Works on both GPU (RunsOn) and CPU-only (GitHub Actions) runners
- Include warm-up vs compiled timing to isolate JIT overhead
- Add system info collection (CPU model, frequency, GPU detection)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -25,6 +25,9 @@ jobs:
           pip install -U "jax[cuda13]"
           pip install numpyro
           python scripts/test-jax-install.py
+      - name: Run Hardware Benchmarks
+        shell: bash -l {0}
+        run: python scripts/benchmark-hardware.py
       - name: Install latex dependencies
         run: |
           sudo apt-get -qq update
diff --git a/scripts/benchmark-hardware.py b/scripts/benchmark-hardware.py
@@ -0,0 +1,264 @@
+"""
+Hardware benchmark script for CI runners.
+Compares CPU and GPU performance to diagnose slowdowns.
+Works on both CPU-only (GitHub Actions) and GPU (RunsOn) runners.
+"""
+import time
+import platform
+import os
+
+def get_cpu_info():
+    """Get CPU information."""
+    print("=" * 60)
+    print("SYSTEM INFORMATION")
+    print("=" * 60)
+    print(f"Platform: {platform.platform()}")
+    print(f"Processor: {platform.processor()}")
+    print(f"Python: {platform.python_version()}")
+    
+    # Try to get CPU frequency
+    try:
+        with open('/proc/cpuinfo', 'r') as f:
+            for line in f:
+                if 'model name' in line:
+                    print(f"CPU Model: {line.split(':')[1].strip()}")
+                    break
+    except:
+        pass
+    
+    # Try to get CPU frequency
+    try:
+        with open('/proc/cpuinfo', 'r') as f:
+            for line in f:
+                if 'cpu MHz' in line:
+                    print(f"CPU MHz: {line.split(':')[1].strip()}")
+                    break
+    except:
+        pass
+    
+    # CPU count
+    print(f"CPU Count: {os.cpu_count()}")
+    
+    # Check for GPU
+    try:
+        import subprocess
+        result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.total', '--format=csv,noheader'], 
+                              capture_output=True, text=True, timeout=5)
+        if result.returncode == 0:
+            print(f"GPU: {result.stdout.strip()}")
+        else:
+            print("GPU: None detected")
+    except:
+        print("GPU: None detected (nvidia-smi not available)")
+    
+    print()
+
+def benchmark_cpu_pure_python():
+    """Pure Python CPU benchmark."""
+    print("=" * 60)
+    print("CPU BENCHMARK: Pure Python")
+    print("=" * 60)
+    
+    # Integer computation
+    start = time.perf_counter()
+    total = sum(i * i for i in range(10_000_000))
+    elapsed = time.perf_counter() - start
+    print(f"Integer sum (10M iterations): {elapsed:.3f} seconds")
+    
+    # Float computation
+    start = time.perf_counter()
+    total = 0.0
+    for i in range(1_000_000):
+        total += (i * 0.1) ** 0.5
+    elapsed = time.perf_counter() - start
+    print(f"Float sqrt (1M iterations): {elapsed:.3f} seconds")
+    print()
+
+def benchmark_cpu_numpy():
+    """NumPy CPU benchmark."""
+    import numpy as np
+    
+    print("=" * 60)
+    print("CPU BENCHMARK: NumPy")
+    print("=" * 60)
+    
+    # Matrix multiplication
+    n = 3000
+    A = np.random.randn(n, n)
+    B = np.random.randn(n, n)
+    
+    start = time.perf_counter()
+    C = A @ B
+    elapsed = time.perf_counter() - start
+    print(f"Matrix multiply ({n}x{n}): {elapsed:.3f} seconds")
+    
+    # Element-wise operations
+    x = np.random.randn(50_000_000)
+    
+    start = time.perf_counter()
+    y = np.cos(x**2) + np.sin(x)
+    elapsed = time.perf_counter() - start
+    print(f"Element-wise ops (50M elements): {elapsed:.3f} seconds")
+    print()
+
+def benchmark_gpu_jax():
+    """JAX benchmark (GPU if available, otherwise CPU)."""
+    try:
+        import jax
+        import jax.numpy as jnp
+        
+        devices = jax.devices()
+        default_backend = jax.default_backend()
+        
+        # Check if GPU is available
+        has_gpu = any('cuda' in str(d).lower() or 'gpu' in str(d).lower() for d in devices)
+        
+        print("=" * 60)
+        if has_gpu:
+            print("JAX BENCHMARK: GPU")
+        else:
+            print("JAX BENCHMARK: CPU (no GPU detected)")
+        print("=" * 60)
+        
+        print(f"JAX devices: {devices}")
+        print(f"Default backend: {default_backend}")
+        print(f"GPU Available: {has_gpu}")
+        print()
+        
+        # Warm-up JIT compilation
+        print("Warming up JIT compilation...")
+        n = 1000
+        key = jax.random.PRNGKey(0)
+        A = jax.random.normal(key, (n, n))
+        B = jax.random.normal(key, (n, n))
+        
+        @jax.jit
+        def matmul(a, b):
+            return jnp.dot(a, b)
+        
+        # Warm-up run (includes compilation)
+        start = time.perf_counter()
+        C = matmul(A, B).block_until_ready()
+        warmup_time = time.perf_counter() - start
+        print(f"Warm-up (includes JIT compile, {n}x{n}): {warmup_time:.3f} seconds")
+        
+        # Actual benchmark (compiled)
+        start = time.perf_counter()
+        C = matmul(A, B).block_until_ready()
+        elapsed = time.perf_counter() - start
+        print(f"Matrix multiply compiled ({n}x{n}): {elapsed:.3f} seconds")
+        
+        # Larger matrix
+        n = 3000
+        A = jax.random.normal(key, (n, n))
+        B = jax.random.normal(key, (n, n))
+        
+        # Warm-up for new size
+        start = time.perf_counter()
+        C = matmul(A, B).block_until_ready()
+        warmup_time = time.perf_counter() - start
+        print(f"Warm-up (recompile for {n}x{n}): {warmup_time:.3f} seconds")
+        
+        # Benchmark compiled
+        start = time.perf_counter()
+        C = matmul(A, B).block_until_ready()
+        elapsed = time.perf_counter() - start
+        print(f"Matrix multiply compiled ({n}x{n}): {elapsed:.3f} seconds")
+        
+        # Element-wise GPU benchmark
+        x = jax.random.normal(key, (50_000_000,))
+        
+        @jax.jit
+        def elementwise_ops(x):
+            return jnp.cos(x**2) + jnp.sin(x)
+        
+        # Warm-up
+        start = time.perf_counter()
+        y = elementwise_ops(x).block_until_ready()
+        warmup_time = time.perf_counter() - start
+        print(f"Element-wise warm-up (50M): {warmup_time:.3f} seconds")
+        
+        # Compiled
+        start = time.perf_counter()
+        y = elementwise_ops(x).block_until_ready()
+        elapsed = time.perf_counter() - start
+        print(f"Element-wise compiled (50M): {elapsed:.3f} seconds")
+        
+        print()
+        
+    except ImportError as e:
+        print(f"JAX not available: {e}")
+    except Exception as e:
+        print(f"JAX benchmark failed: {e}")
+
+def benchmark_numba():
+    """Numba CPU benchmark."""
+    try:
+        import numba
+        import numpy as np
+        
+        print("=" * 60)
+        print("CPU BENCHMARK: Numba")
+        print("=" * 60)
+        
+        @numba.jit(nopython=True)
+        def numba_sum(n):
+            total = 0
+            for i in range(n):
+                total += i * i
+            return total
+        
+        # Warm-up (compilation)
+        start = time.perf_counter()
+        result = numba_sum(10_000_000)
+        warmup_time = time.perf_counter() - start
+        print(f"Integer sum warm-up (includes compile): {warmup_time:.3f} seconds")
+        
+        # Compiled run
+        start = time.perf_counter()
+        result = numba_sum(10_000_000)
+        elapsed = time.perf_counter() - start
+        print(f"Integer sum compiled (10M): {elapsed:.3f} seconds")
+        
+        @numba.jit(nopython=True, parallel=True)
+        def numba_parallel_sum(arr):
+            total = 0.0
+            for i in numba.prange(len(arr)):
+                total += arr[i] ** 2
+            return total
+        
+        arr = np.random.randn(50_000_000)
+        
+        # Warm-up
+        start = time.perf_counter()
+        result = numba_parallel_sum(arr)
+        warmup_time = time.perf_counter() - start
+        print(f"Parallel sum warm-up (50M): {warmup_time:.3f} seconds")
+        
+        # Compiled
+        start = time.perf_counter()
+        result = numba_parallel_sum(arr)
+        elapsed = time.perf_counter() - start
+        print(f"Parallel sum compiled (50M): {elapsed:.3f} seconds")
+        
+        print()
+        
+    except ImportError as e:
+        print(f"Numba not available: {e}")
+    except Exception as e:
+        print(f"Numba benchmark failed: {e}")
+
+if __name__ == "__main__":
+    print("\n" + "=" * 60)
+    print("HARDWARE BENCHMARK FOR CI RUNNER")
+    print("=" * 60 + "\n")
+    
+    get_cpu_info()
+    benchmark_cpu_pure_python()
+    benchmark_cpu_numpy()
+    benchmark_numba()
+    benchmark_gpu_jax()
+    
+    print("=" * 60)
+    print("BENCHMARK COMPLETE")
+    print("=" * 60)