Commit d631109

sam: enable flash attention
1 parent b766b0a

3 files changed: +55 -39 lines changed

src/visp/arch/mobile-sam.cpp

Lines changed: 25 additions & 13 deletions

@@ -1,8 +1,8 @@
 #include "visp/arch/mobile-sam.h"
-#include "visp/nn.h"
-#include "visp/vision.h"
 #include "util/math.h"
 #include "util/string.h"
+#include "visp/nn.h"
+#include "visp/vision.h"
 
 #include <ggml.h>
 
@@ -13,7 +13,7 @@ namespace visp {
 namespace sam {
 
 tensor conv_2d_batch_norm(model_ref m, tensor x, int stride = 1, int pad = 0) {
-    // batch_norm is fused into conv_2d when converting the model
+    // batch_norm is fused into conv_2d when converting the model
     return conv_2d(m["c"], x, stride, pad);
 }
 
@@ -68,7 +68,6 @@ tensor window_reverse(model_ref m, tensor x, int w, int h, int window) {
 // Image encoder
 //
 
-
 tensor patch_embed(model_ref m, tensor x) {
     x = conv_2d_batch_norm(m["seq.0"], x, 2, 1);
     x = ggml_gelu_inplace(m, x);
@@ -142,17 +141,30 @@ tensor attention_rel_bias(model_ref m, tensor x, int dim, int num_heads) {
     tensor q = split(m, qkv, 0);
     tensor k = split(m, qkv, 1);
     tensor v = split(m, qkv, 2);
-    q = ggml_cont(m, ggml_permute(m, q, 0, 2, 1, 3));
-    k = ggml_cont(m, ggml_permute(m, k, 0, 2, 1, 3));
-    v = ggml_cont(m, ggml_permute(m, v, 1, 2, 0, 3)); // transpose for mul_mat later
+    tensor mask = m.weights("attention_biases_indexed");
+    float scale = 1.0f / std::sqrt(float(key_dim));
+
+    if (m.flags & model_build_flag::flash_attention) {
+        q = ggml_cont(m, ggml_permute(m, q, 0, 2, 1, 3));
+        k = ggml_cast(m, ggml_permute(m, k, 0, 2, 1, 3), GGML_TYPE_F16);
+        v = ggml_cast(m, ggml_permute(m, v, 0, 2, 1, 3), GGML_TYPE_F16);
+        if (mask->type != GGML_TYPE_F16) {
+            mask = ggml_cast(m, mask, GGML_TYPE_F16);
+        }
 
-    tensor attn = ggml_mul_mat(m, k, q); // q @ k (k is transposed in mul_mat)
-    attn = ggml_scale_inplace(m, attn, 1.0f / std::sqrt(float(key_dim)));
-    attn = ggml_add_inplace(m, attn, m.weights("attention_biases_indexed"));
-    attn = ggml_soft_max(m, attn);
+        x = ggml_flash_attn_ext(m, q, k, v, mask, scale, 0.0f, 0.0f);
+        ggml_flash_attn_ext_set_prec(x, GGML_PREC_F32);
+    } else {
+        q = ggml_cont(m, ggml_permute(m, q, 0, 2, 1, 3));
+        k = ggml_cont(m, ggml_permute(m, k, 0, 2, 1, 3));
+        v = ggml_cont(m, ggml_permute(m, v, 1, 2, 0, 3)); // transpose for mul_mat later
 
-    x = ggml_mul_mat(m, v, attn); // attn @ v
-    x = ggml_cont(m, ggml_permute(m, x, 0, 2, 1, 3)); // transpose(1, 2)
+        tensor attn = ggml_mul_mat(m, k, q); // q @ k (k is transposed in mul_mat)
+        attn = ggml_soft_max_ext(m, attn, mask, scale, 0.0f);
+
+        x = ggml_mul_mat(m, v, attn); // attn @ v
+        x = ggml_cont(m, ggml_permute(m, x, 0, 2, 1, 3)); // transpose(1, 2)
+    }
     x = ggml_reshape_3d(m, x, key_dim * num_heads, n, b);
     x = linear(m["proj"], x);
 
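Note on the attention change above: both branches compute the same masked, scaled dot-product attention, softmax(q @ k^T * scale + bias) @ v with the indexed relative-position bias used as the mask; the flash path merely fuses it via ggml_flash_attn_ext with F16 K/V. A minimal PyTorch sketch of that equivalence (illustrative only, not part of this commit; shapes chosen arbitrarily):

import math
import torch
import torch.nn.functional as F

def attention_reference(q, k, v, bias):
    # explicit path: softmax(q @ k^T * scale + bias) @ v
    scale = 1.0 / math.sqrt(q.shape[-1])
    attn = (q @ k.transpose(-2, -1)) * scale + bias
    return attn.softmax(dim=-1) @ v

def attention_fused(q, k, v, bias):
    # fused path, analogous to the flash-attention branch: the bias acts as an additive mask
    return F.scaled_dot_product_attention(q, k, v, attn_mask=bias)

q, k, v = (torch.randn(1, 2, 9, 4) for _ in range(3))  # (batch, heads, tokens, key_dim)
bias = torch.randn(1, 2, 9, 9)                         # relative-position bias per head
assert torch.allclose(attention_reference(q, k, v, bias), attention_fused(q, k, v, bias), atol=1e-5)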

tests/test_mobile_sam.py

Lines changed: 25 additions & 26 deletions

@@ -6,7 +6,7 @@
 from torch import Tensor
 
 from . import workbench
-from .workbench import to_nhwc, to_nchw, convert_to_nhwc, fuse_conv_2d_batch_norm
+from .workbench import to_nhwc, to_nchw, convert_to_nhwc, fuse_conv_2d_batch_norm, tensors_match
 
 torch.set_printoptions(precision=2, linewidth=100, sci_mode=False)
 
@@ -53,7 +53,7 @@ def test_conv_2d_batch_norm(bias: bool):
     result = workbench.invoke_test("sam_conv_2d_batch_norm", x, state, nhwc_layout)
     result = to_nchw(result)
 
-    assert torch.allclose(result, expected)
+    assert tensors_match(result, expected)
 
 
 class PatchEmbed(torch.nn.Module):
@@ -98,7 +98,7 @@ def test_patch_embed():
     result = workbench.invoke_test("sam_patch_embed", x, state, nhwc_layout)
     result = to_nchw(result)
 
-    assert torch.allclose(result, expected, rtol=0.001, atol=0.02)
+    assert tensors_match(result, expected, rtol=0.001, atol=0.02)
 
 
 class LayerNorm2d(torch.nn.Module):
@@ -130,7 +130,7 @@ def test_layer_norm_2d():
     result = workbench.invoke_test("layer_norm", x, state, nhwc_layout)
     result = to_nchw(result)
 
-    assert torch.allclose(result, expected, rtol=0.001, atol=0.02)
+    assert tensors_match(result, expected, rtol=0.001, atol=0.02)
 
 
 class MBConv(torch.nn.Module):
@@ -193,7 +193,7 @@ def test_mb_conv():
     result = to_nchw(result)
 
     # precision: ggml_gelu uses fp16 look-up table & tanh approximation
-    assert torch.allclose(result, expected, rtol=0.001, atol=0.02)
+    assert tensors_match(result, expected, rtol=0.001, atol=0.02)
 
 
 class PatchMerging(torch.nn.Module):
@@ -244,7 +244,7 @@ def test_patch_merging():
     result = result.transpose(1, 2).reshape_as(expected)
 
     # precision: ggml_gelu uses fp16 look-up table & tanh approximation
-    assert torch.allclose(result, expected, rtol=0.001, atol=0.02)
+    assert tensors_match(result, expected, rtol=0.001, atol=0.02)
 
 
 class Mlp(torch.nn.Module):
@@ -288,7 +288,7 @@ def test_mlp():
     result = workbench.invoke_test("sam_mlp", x, state)
 
     # precision: ggml_gelu uses fp16 look-up table & tanh approximation
-    assert torch.allclose(result, expected, rtol=0.001, atol=0.02)
+    assert tensors_match(result, expected, rtol=0.001, atol=0.02)
 
 
 class AttentionRelBias(torch.nn.Module):
@@ -370,8 +370,8 @@ def forward(self, x): # x (B,N,C)
         x = self.proj(x)
         return x
 
-
-def test_attention_rel_bias():
+@pytest.mark.parametrize("attn", ["default", "flash_attn"])
+def test_attention_rel_bias(attn: str):
     attention = AttentionRelBias(4, 2, num_heads=2, attn_ratio=1, resolution=(3, 3))
     state = workbench.randomize(attention.state_dict())
     attention.load_state_dict(state)
@@ -381,9 +381,9 @@ def test_attention_rel_bias():
     expected = attention(x)
 
     state["attention_biases_indexed"] = state["attention_biases"][:, attention.attention_bias_idxs]
-    result = workbench.invoke_test("sam_attention_rel_bias", x, state)
+    result = workbench.invoke_test("sam_attention_rel_bias", x, state, {"attn": attn})
 
-    assert torch.allclose(result, expected, atol=0.001)
+    assert tensors_match(result, expected, atol=0.001)
 
 
 class TinyViTBlock(torch.nn.Module):
@@ -495,7 +495,7 @@ def test_tiny_vit_block():
     state = convert_to_nhwc(state)
     result = workbench.invoke_test("sam_tiny_vit_block", x, state, nhwc_layout)
 
-    assert torch.allclose(result, expected, rtol=0.001, atol=0.02)
+    assert tensors_match(result, expected, rtol=0.001, atol=0.02)
 
 
 class ConvLayer(torch.nn.Module):
@@ -787,7 +787,7 @@ def test_tiny_vit():
     # result = torch.zeros_like(expected).contiguous()
    # result = workbench.invoke_test("sam_tiny_vit", x, state)
 
-    # assert torch.allclose(result, expected, rtol=0.001, atol=0.02)
+    # assert tensors_match(result, expected, rtol=0.001, atol=0.02)
 
 
 #
@@ -835,7 +835,7 @@ def test_position_embedding_random():
 
     result = workbench.invoke_test("sam_position_embedding_random", x, state)
 
-    assert torch.allclose(result, expected)
+    assert tensors_match(result, expected)
 
 
 class PromptEncoder(torch.nn.Module):
@@ -951,7 +951,7 @@ def test_prompt_encoder_points():
     points = torch.cat([points, -torch.ones(1, 1, 2)], dim=1)
     result = workbench.invoke_test("sam_embed_points", points, state)
 
-    assert torch.allclose(result, expected)
+    assert tensors_match(result, expected)
 
 
 def test_prompt_encoder_box():
@@ -970,7 +970,7 @@ def test_prompt_encoder_box():
 
     result = workbench.invoke_test("sam_embed_box", boxes, state)
 
-    assert torch.allclose(result, expected)
+    assert tensors_match(result, expected)
 
 
 #
@@ -1046,7 +1046,7 @@ def test_attention():
     state["input_v"] = v
     result = workbench.invoke_test("sam_attention", q, state)
 
-    assert torch.allclose(result, expected)
+    assert tensors_match(result, expected)
 
 
 class MLPBlock(torch.nn.Module):
@@ -1155,8 +1155,8 @@ def test_two_way_attention_block(mode):
         "sam_two_way_attention_block", queries, state, {"mode": mode}
     )
 
-    assert torch.allclose(result_queries, expected_queries)
-    assert torch.allclose(result_keys, expected_keys)
+    assert tensors_match(result_queries, expected_queries)
+    assert tensors_match(result_keys, expected_keys)
 
 
 class TwoWayTransformer(torch.nn.Module):
@@ -1257,8 +1257,8 @@ def test_two_way_transformer():
        "sam_two_way_transformer", image_embedding, state, nhwc_layout
     )
 
-    assert torch.allclose(result_queries, expected_queries, atol=1e-6, rtol=1e-4)
-    assert torch.allclose(result_keys, expected_keys, atol=1e-6, rtol=1e-4)
+    assert tensors_match(result_queries, expected_queries, atol=1e-6, rtol=1e-4)
+    assert tensors_match(result_keys, expected_keys, atol=1e-6, rtol=1e-4)
 
 
 class HypernetworkMLP(torch.nn.Module):
@@ -1297,7 +1297,7 @@ def test_hypernetwork_mlp():
 
     result = workbench.invoke_test("sam_hypernetwork_mlp", x, state)
 
-    assert torch.allclose(result, expected)
+    assert tensors_match(result, expected)
 
 
 def output_upscaling(transformer_dim: int, activation=torch.nn.GELU):
@@ -1325,8 +1325,7 @@ def test_output_upscaling():
     result = workbench.invoke_test("sam_output_upscaling", x, state, nhwc_layout, backend="vulkan")
     result = to_nchw(result)
 
-    workbench.print_results(result, expected)
-    assert torch.allclose(result, expected, rtol=0.1)  # fp16 weights
+    assert tensors_match(result, expected, rtol=0.1)  # fp16 weights
 
 
 class MaskDecoder(torch.nn.Module):
@@ -1465,5 +1464,5 @@ def test_predict_masks():
         "sam_predict_masks", image_embeddings, state, nhwc_layout, backend="vulkan"
     )
 
-    assert torch.allclose(result_masks, expected_masks, rtol=1e-2, atol=1e-2)
-    assert torch.allclose(result_iou_pred, iou_pred, rtol=1e-2)
+    assert tensors_match(result_masks, expected_masks, rtol=1e-2, atol=1e-2)
+    assert tensors_match(result_iou_pred, iou_pred, rtol=1e-2)
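The assertions now route through tensors_match from tests/workbench.py instead of calling torch.allclose directly. That helper is not part of this diff; a hypothetical minimal sketch of the contract the new asserts assume (torch.allclose semantics plus a short diagnostic on mismatch) follows, and the real implementation may differ:

import torch

def tensors_match(result: torch.Tensor, expected: torch.Tensor, rtol: float = 1e-5, atol: float = 1e-8) -> bool:
    # hypothetical sketch only -- the actual tensors_match lives in tests/workbench.py
    if torch.allclose(result, expected, rtol=rtol, atol=atol):
        return True
    diff = (result - expected).abs()
    print(f"tensors differ: max abs diff {diff.max().item():.5g}, mean {diff.mean().item():.5g}")
    return False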

tests/workbench.cpp

Lines changed: 5 additions & 0 deletions

@@ -154,6 +154,11 @@ DEF(sam_mlp)(model_ref m, span<tensor> input, param_dict const& p) {
 }
 
 DEF(sam_attention_rel_bias)(model_ref m, span<tensor> input, param_dict const& p) {
+    if (p.get("attn", "default") == "flash_attn"sv) {
+        m.flags = m.flags | model_build_flag::flash_attention;
+    } else {
+        m.flags = m.flags & ~model_build_flag::flash_attention;
+    }
     return {sam::attention_rel_bias(m, input[0], 4, 2)};
 }
 