diff --git a/benchmarks/attention/benchmark_attention.py b/benchmarks/attention/benchmark_attention.py
index 77b2da0b10..c4e20a196e 100644
--- a/benchmarks/attention/benchmark_attention.py
+++ b/benchmarks/attention/benchmark_attention.py
@@ -1,6 +1,10 @@
 # Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # See LICENSE for license information.
+#
+# Note: A "flash-attn v3" warning may appear from Transformer Engine. The script does not
+# install flash-attn; it uses whatever is already installed (e.g. v2). The warning suggests
+# installing v3 for Hopper+ for better support; timings are still from the active backend.

 import os, sys, time
 import subprocess
@@ -9,10 +13,14 @@
 import torch
 import nvtx
 import transformer_engine
-from tests.pytorch.utils import (
-    ModelConfig,
-    get_available_attention_backends,
-)
+
+# Add project root so "tests" can be imported when run from any directory
+_script_dir = os.path.dirname(os.path.abspath(__file__))
+_project_root = os.path.dirname(os.path.dirname(_script_dir))  # repo root (parent of benchmarks/)
+if _project_root not in sys.path:
+    sys.path.insert(0, _project_root)
+
+from tests.pytorch.utils import ModelConfig, get_available_attention_backends
 from tests.pytorch.attention.test_attention import _run_dot_product_attention

 pd.set_option("display.precision", 4)
@@ -32,16 +40,39 @@
 # training mode
 is_training = True

+# Substrings to match kernel names in nsys cuda_gpu_trace CSV (case-insensitive).
+# If profiling output changes, update these (e.g. cuDNN may use "cudnn" or "cuda", flash may use "flash" or "fmha").
+KERNEL_NAME_CUDNN = "cudnn"
+KERNEL_NAME_FLASH = "flash"
+
 model_configs = {
-    # test: b, h, hg, d, sq, skv, p, mask, bias
-    "test_0": ModelConfig(2, 16, 16, 64, 512, 512, 0.0, "no_mask", "no_bias"),  # short seq
-    "test_1": ModelConfig(2, 16, 16, 128, 2048, 2048, 0.0, "causal", "no_bias"),  # longer seq, mask
-    "test_2": ModelConfig(2, 16, 16, 128, 2048, 2048, 0.0, "causal", "post_scale_bias"),  # bias
-    "test_3": ModelConfig(2, 32, 4, 128, 8192, 8192, 0.0, "causal", "no_bias"),  # GQA
+    # ModelConfig(batch_size, max_seqlen_q, num_heads, head_dim_qk, max_seqlen_kv, num_gqa_groups, ...)
+    "test_0": ModelConfig(
+        2, 512, 16, 64, 512, 16, dropout_p=0.0, attn_mask_type="no_mask", attn_bias_type="no_bias"
+    ),  # short seq
+    "test_1": ModelConfig(
+        2, 2048, 16, 128, 2048, 16, dropout_p=0.0, attn_mask_type="causal", attn_bias_type="no_bias"
+    ),  # longer seq, mask
+    "test_2": ModelConfig(
+        2,
+        2048,
+        16,
+        128,
+        2048,
+        16,
+        dropout_p=0.0,
+        attn_mask_type="causal",
+        attn_bias_type="post_scale_bias",
+    ),  # bias; FlashAttention does not support post_scale_bias, so only cuDNN runs
+    "test_3": ModelConfig(
+        2, 8192, 32, 128, 8192, 4, dropout_p=0.0, attn_mask_type="causal", attn_bias_type="no_bias"
+    ),  # GQA
 }


-def benchmark_dot_product_attention(model, fused_attn_supported, flash_attn_supported):
+def benchmark_dot_product_attention(
+    model, fused_attn_supported, flash_attn_supported, append_csv=True
+):
     config = model_configs[model]
     if dtype == torch.bfloat16:
         tols = dict(atol=2.5e-2, rtol=2.5e-2)
@@ -53,7 +84,7 @@ def benchmark_dot_product_attention(model, fused_attn_supported, flash_attn_supp
     warmup_iters = 3
     for i in range(warmup_iters):
         if fused_attn_supported:
-            fused_attn_fwd, fused_attn_bwd = _run_dot_product_attention(
+            fused_attn_fwd, _, fused_attn_bwd = _run_dot_product_attention(
                 dtype,
                 config,
                 "FusedAttention",
@@ -64,7 +95,7 @@ def benchmark_dot_product_attention(model, fused_attn_supported, flash_attn_supp
                 is_training,
             )
         if flash_attn_supported:
-            flash_attn_fwd, flash_attn_bwd = _run_dot_product_attention(
+            flash_attn_fwd, _, flash_attn_bwd = _run_dot_product_attention(
                 dtype,
                 config,
                 "FlashAttention",
@@ -77,14 +108,15 @@ def benchmark_dot_product_attention(model, fused_attn_supported, flash_attn_supp
     if fused_attn_supported and flash_attn_supported:
         torch.testing.assert_close(fused_attn_fwd, flash_attn_fwd, **tols)
         for i, _ in enumerate(flash_attn_bwd):
-            torch.testing.assert_close(fused_attn_bwd[i], flash_attn_bwd[i], **tols)
+            if fused_attn_bwd[i] is not None and flash_attn_bwd[i] is not None:
+                torch.testing.assert_close(fused_attn_bwd[i], flash_attn_bwd[i], **tols)

     torch.cuda.cudart().cudaProfilerStart()
     torch.cuda.synchronize()
     fused_attn_start = time.time()
     if fused_attn_supported:
         for i in range(num_iters):
-            fused_attn_fwd, fused_attn_bwd = _run_dot_product_attention(
+            _run_dot_product_attention(
                 dtype,
                 config,
                 "FusedAttention",
@@ -101,7 +133,7 @@ def benchmark_dot_product_attention(model, fused_attn_supported, flash_attn_supp
     flash_attn_start = time.time()
     if flash_attn_supported:
         for i in range(num_iters):
-            flash_attn_fwd, flash_attn_bwd = _run_dot_product_attention(
+            _run_dot_product_attention(
                 dtype,
                 config,
                 "FlashAttention",
@@ -114,61 +146,75 @@ def benchmark_dot_product_attention(model, fused_attn_supported, flash_attn_supp
     torch.cuda.synchronize()
     flash_attn_time = time.time() - flash_attn_start if flash_attn_supported else 0

-    df = pd.read_csv("times.csv")
-    df = pd.concat(
-        [
-            df,
-            pd.DataFrame(
-                [
+    if append_csv:
+        df = pd.read_csv("times.csv")
+        df = pd.concat(
+            [
+                df,
+                pd.DataFrame(
                     [
-                        fused_attn_time * 1e3 / num_iters,
-                        0,
-                        0,
-                        0,
-                        flash_attn_time * 1e3 / num_iters,
-                        0,
-                        0,
-                        0,
-                        0,
-                    ]
-                ],
-                columns=df.columns,
-            ),
-        ],
-        ignore_index=True,
-    )
-    df.to_csv("times.csv", index=False)
+                        [
+                            fused_attn_time * 1e3 / num_iters,
+                            0,
+                            0,
+                            0,
+                            flash_attn_time * 1e3 / num_iters,
+                            0,
+                            0,
+                            0,
+                            0,
+                        ]
+                    ],
+                    columns=df.columns,
+                ),
+            ],
+            ignore_index=True,
+        )
+        df.to_csv("times.csv", index=False)
     torch.cuda.cudart().cudaProfilerStop()


 def parse_results(per_cudnn, per_flash, model):
+    bench_dir = os.path.dirname(os.path.abspath(__file__))
     filename = f"prof_{model}_cuda_gpu_trace.csv"
-    df = pd.read_csv(os.path.join("./", filename))
-    df_times = pd.read_csv("times.csv")
+    filepath = os.path.join(bench_dir, filename)
+    if not os.path.isfile(filepath):
+        return
+    df = pd.read_csv(filepath)
+    df_times = pd.read_csv(os.path.join(bench_dir, "times.csv"))
     row = len(df_times.index) - 1

+    # Match kernel names case-insensitively; column may be "Name" or "Kernel Name" in nsys output
+    name_col = "Name" if "Name" in df.columns else "Kernel Name"
+    names = df[name_col].astype(str).str.lower()
+
     if per_cudnn > 0:
-        t_cudnn_all = df[df["Name"].str.contains("cudnn")]["Duration (ns)"].to_numpy()
-        t_cudnn_all = t_cudnn_all.reshape(-1, per_cudnn)
-        t_cudnn_avg = np.average(t_cudnn_all, axis=0)
-        df_times.loc[row, "FusedAttention Kernels (fwd)"] = t_cudnn_avg[0] / 1e6
-        df_times.loc[row, "FusedAttention Kernels (bwd)"] = t_cudnn_avg[1:4].sum() / 1e6
-        df_times.loc[row, "FusedAttention Kernels (fwd+bwd)"] = t_cudnn_avg.sum() / 1e6
+        cudnn_mask = names.str.contains(KERNEL_NAME_CUDNN.lower(), regex=False)
+        if cudnn_mask.any():
+            t_cudnn_all = df.loc[cudnn_mask, "Duration (ns)"].to_numpy()
+            t_cudnn_all = t_cudnn_all.reshape(-1, per_cudnn)
+            t_cudnn_avg = np.average(t_cudnn_all, axis=0)
+            df_times.loc[row, "FusedAttention Kernels (fwd)"] = t_cudnn_avg[0] / 1e6
+            df_times.loc[row, "FusedAttention Kernels (bwd)"] = t_cudnn_avg[1:4].sum() / 1e6
+            df_times.loc[row, "FusedAttention Kernels (fwd+bwd)"] = t_cudnn_avg.sum() / 1e6

     if per_flash > 0:
-        t_flash_all = df[df["Name"].str.contains("flash")]["Duration (ns)"].to_numpy()
-        t_flash_all = t_flash_all.reshape(-1, per_flash)
-        t_flash_avg = np.average(t_flash_all, axis=0)
-        df_times.loc[row, "FlashAttention Kernels (fwd)"] = t_flash_avg[0] / 1e6
-        df_times.loc[row, "FlashAttention Kernels (bwd)"] = t_flash_avg[1:4].sum() / 1e6
-        df_times.loc[row, "FlashAttention Kernels (fwd+bwd)"] = t_flash_avg.sum() / 1e6
+        flash_mask = names.str.contains(KERNEL_NAME_FLASH.lower(), regex=False)
+        if flash_mask.any():
+            t_flash_all = df.loc[flash_mask, "Duration (ns)"].to_numpy()
+            t_flash_all = t_flash_all.reshape(-1, per_flash)
+            t_flash_avg = np.average(t_flash_all, axis=0)
+            df_times.loc[row, "FlashAttention Kernels (fwd)"] = t_flash_avg[0] / 1e6
+            df_times.loc[row, "FlashAttention Kernels (bwd)"] = t_flash_avg[1:4].sum() / 1e6
+            df_times.loc[row, "FlashAttention Kernels (fwd+bwd)"] = t_flash_avg.sum() / 1e6

     if per_cudnn > 0 and per_flash > 0:
-        df_times.loc[row, "Fused vs Flash Kernels Speedup (fwd+bwd)"] = (
-            df_times.loc[row, "FlashAttention Kernels (fwd+bwd)"]
-            / df_times.loc[row, "FusedAttention Kernels (fwd+bwd)"]
-        )
-    df_times.to_csv("times.csv", index=False)
+        fwd_bwd = df_times.loc[row, "FusedAttention Kernels (fwd+bwd)"]
+        if fwd_bwd and fwd_bwd > 0:
+            df_times.loc[row, "Fused vs Flash Kernels Speedup (fwd+bwd)"] = (
+                df_times.loc[row, "FlashAttention Kernels (fwd+bwd)"] / fwd_bwd
+            )
+    df_times.to_csv(os.path.join(bench_dir, "times.csv"), index=False)


 def main():
@@ -201,7 +247,7 @@ def main():
             config,
             qkv_dtype=dtype,
             qkv_layout=qkv_layout,
-            window_size=config.window_size,
+            # window_size=config.window_size,
             pad_between_seqs=pad_between_seqs,
         )
         flash_attn_supported, fused_attn_supported, unfused_attn_supported = available_backends
@@ -211,6 +257,17 @@ def main():
             f'{" and flash-attention" if flash_attn_supported else ""}...'
         )

+        # Run benchmark in main process so times.csv always gets a row (works without nsys)
+        benchmark_dot_product_attention(
+            model, fused_attn_supported, flash_attn_supported, append_csv=True
+        )
+
+        # Optional: run under nsys to get kernel-level stats; subprocess must not append again
+        bench_code = (
+            "import benchmark_attention; "
+            "benchmark_attention.benchmark_dot_product_attention("
+            f"'{model}', {fused_attn_supported}, {flash_attn_supported}, append_csv=False)"
+        )
         prof_cmd = [
             "nsys",
             "profile",
@@ -220,58 +277,80 @@ def main():
             f"--output=prof_{model}",
             "python",
             "-c",
-            f""" "import benchmark_attention;""",
-            f"""benchmark_attention.benchmark_dot_product_attention("""
-            f"""'{model}', {fused_attn_supported}, {flash_attn_supported})" """,
-        ]
-        prof_cmd = " ".join(prof_cmd)
-        subprocess.call(prof_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, shell=True)
-        stats_cmd = [
-            "nsys",
-            "stats",
-            "-q",
-            "-r",
-            "cuda_gpu_trace",
-            "--format",
-            "csv,column",
-            "--force-overwrite=true",
-            "--force-export=true",
-            f"--output=prof_{model}",
-            f"prof_{model}.nsys-rep",
-        ]
-        if fused_attn_supported:
-            num_kernels_cudnn = 4
-            if config.attn_bias_type == "post_scale_bias":
-                num_kernels_cudnn = num_kernels_cudnn + 1
-            if config.num_heads != config.num_gqa_groups:
-                num_kernels_cudnn = num_kernels_cudnn + 2
-        else:
-            num_kernels_cudnn = 0
-        num_kernels_flash = 4 if flash_attn_supported else 0
-        stats_cmd = " ".join(stats_cmd)
-        subprocess.call(stats_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, shell=True)
-        parse_cmd = [
-            "python",
-            "-c",
-            f""" "import benchmark_attention;""",
-            f"""benchmark_attention.parse_results("""
-            f"""{num_kernels_cudnn}, {num_kernels_flash}, '{model}')" """,
+            bench_code,
         ]
-        parse_cmd = " ".join(parse_cmd)
-        subprocess.call(parse_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, shell=True)
+        bench_dir = os.path.dirname(os.path.abspath(__file__))
+        prof_ret = subprocess.call(
+            prof_cmd,
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+            cwd=bench_dir,
+        )
+        if prof_ret == 0:
+            stats_cmd = [
+                "nsys",
+                "stats",
+                "-q",
+                "-r",
+                "cuda_gpu_trace",
+                "--format",
+                "csv,column",
+                "--force-overwrite=true",
+                "--force-export=true",
+                f"--output=prof_{model}",
+                f"prof_{model}.nsys-rep",
+            ]
+            if fused_attn_supported:
+                num_kernels_cudnn = 4
+                if config.attn_bias_type == "post_scale_bias":
+                    num_kernels_cudnn = num_kernels_cudnn + 1
+                if config.num_heads != config.num_gqa_groups:
+                    num_kernels_cudnn = num_kernels_cudnn + 2
+            else:
+                num_kernels_cudnn = 0
+            num_kernels_flash = 4 if flash_attn_supported else 0
+            subprocess.call(
+                stats_cmd,
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+                cwd=bench_dir,
+            )
+            parse_code = (
+                "import benchmark_attention; "
+                "benchmark_attention.parse_results("
+                f"{num_kernels_cudnn}, {num_kernels_flash}, '{model}')"
+            )
+            parse_cmd = ["python", "-c", parse_code]
+            subprocess.call(
+                parse_cmd,
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+                cwd=bench_dir,
+            )

     df_times = pd.read_csv("times.csv")
+    n_models = len(model_configs)
+    if len(df_times) != n_models:
+        raise RuntimeError(
+            f"times.csv has {len(df_times)} rows but expected {n_models}. "
+            "Subprocess benchmarks may have failed (check nsys availability)."
+        )
     df_times.index = list(model_configs.keys())
-    a = df_times[
-        [
-            "FusedAttention Kernels (fwd+bwd)",
-            "FlashAttention Kernels (fwd+bwd)",
-            "Fused vs Flash Kernels Speedup (fwd+bwd)",
-        ]
-    ]
-    a.columns = ["cuDNN fwd+bwd (ms)", "flash-attn fwd+bwd (ms)", "cuDNN vs flash speedup"]
+    # Prefer module timings (from time.time(), always populated); fall back to kernel timings (from nsys)
+    cudnn_col = "FusedAttention Module"
+    flash_col = "FlashAttention Module"
+    a = df_times[[cudnn_col, flash_col]].copy()
+    a.columns = ["cuDNN fwd+bwd (ms)", "flash-attn fwd+bwd (ms)"]
+    # Speedup: flash/cudnn ratio (>1 means cuDNN faster). N/A when only one backend ran (e.g. test_2 has bias, flash not used).
+    cudnn_ms = df_times[cudnn_col]
+    flash_ms = df_times[flash_col]
+    speedup = np.where((cudnn_ms > 0) & (flash_ms > 0), flash_ms / cudnn_ms, np.nan)
+    a["cuDNN vs flash speedup"] = speedup
+    # Show "N/A" instead of NaN when speedup not defined (only one backend ran)
+    a_display = a.copy()
+    a_display["cuDNN vs flash speedup"] = [f"{x:.4f}" if not pd.isna(x) else "N/A" for x in speedup]
     print()
-    print(a)
+    print(a_display)


 if __name__ == "__main__":
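
Reviewer note (illustration only, not part of the patch): the nsys post-processing in parse_results() is the least obvious piece of the change. The sketch below reproduces just its filter/reshape/average step on a made-up cuda_gpu_trace table; the kernel names, durations, and the three-iteration layout are invented for demonstration, and only the aggregation logic mirrors the script (the first kernel of each iteration is taken as forward, the next three as backward, averaged over iterations and converted from ns to ms).

import numpy as np
import pandas as pd

# Kernels launched per fwd+bwd iteration; main() computes this as 4, +1 with post_scale_bias, +2 with GQA.
per_cudnn = 4
# Synthetic stand-in for the nsys "cuda_gpu_trace" CSV: one row per kernel launch, 3 iterations here.
trace = pd.DataFrame(
    {
        "Name": ["cudnn_fwd", "cudnn_bwd_a", "cudnn_bwd_b", "cudnn_bwd_c"] * 3,
        "Duration (ns)": [1.0e6, 2.0e6, 2.1e6, 1.9e6] * 3,
    }
)

names = trace["Name"].astype(str).str.lower()
mask = names.str.contains("cudnn", regex=False)  # same substring match as KERNEL_NAME_CUDNN
durations = trace.loc[mask, "Duration (ns)"].to_numpy().reshape(-1, per_cudnn)
avg = durations.mean(axis=0)  # average each kernel position across iterations
print("fwd (ms):", avg[0] / 1e6)  # first kernel per iteration -> forward
print("bwd (ms):", avg[1:4].sum() / 1e6)  # next three kernels -> backward
print("fwd+bwd (ms):", avg.sum() / 1e6)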