birefnet: switch to direct version of conv_2d_deform on Vulkan

Acly · Acly · commit 87f3d7667ca7 · 2025-08-12T16:00:14.000+02:00
* times: old -> new (without coopmat2) / new (with coopmat2)
* birefnet: 268ms -> 315ms / 243ms
* birefnet-lite: 109ms -> 119ms / 87ms
* deform conv2d is a bit slower without coopmat2 support, but it also requires much less VRAM, so it is still worth it
diff --git a/README.md b/README.md
@@ -192,9 +192,9 @@ as other frameworks for inference speed, but with:
 | Model |      |      | _vision.cpp_ |  PyTorch | ONNX Runtime |
 | :---- | :--- | :--- | -----------: | -------: | -----------: |
 | Full  | cpu  | f32  |     16333 ms | 18800 ms |              |
-| Full  | gpu  | f16  |       268 ms |   140 ms |              |
+| Full  | gpu  | f16  |       243 ms |   140 ms |              |
 | Lite  | cpu  | f32  |      4505 ms | 10900 ms |      6978 ms |
-| Lite  | gpu  | f16  |       109 ms |    59 ms |              |
+| Lite  | gpu  | f16  |        86 ms |    59 ms |              |
 
 #### MI-GAN, 512x512
 
@@ -205,7 +205,7 @@ as other frameworks for inference speed, but with:
 
 #### Setup
 
-* vision.cpp: using vision-bench, GPU via Vulkan, eg. `vision-bench sam cpu`
+* vision.cpp: using vision-bench, GPU via Vulkan, eg. `vision-bench -m sam -b cpu`
 * PyTorch: v2.7.1+cu128, eager eval, GPU via CUDA, average n iterations after warm-up
 
 ## Dependencies (integrated)
diff --git a/depend/ggml b/depend/ggml
@@ -1 +1 @@
-Subproject commit 28437efd5e22a797b65318913f38483f1f508b09
+Subproject commit d7d8f437ec6c532cc97c44a3f8d6305a8ba058ef
diff --git a/src/visp/nn.cpp b/src/visp/nn.cpp
@@ -154,13 +154,6 @@ tensor conv_2d_deform(
     
     if (m.flags & model_build_flag::cwhn) {
         x = permute_whcn_to_cwhn(m, x);
-    } else if (!(m.flags & model_build_flag::f16_conv_transpose)) {
-        // Vulkan WHCN implementation doesn't do the final permute atm
-        // only worth fixing if WHCN ends up faster AND we dont implement
-        // a direct version of conv_2d_deform
-        auto [w, h, c, n] = nelements(x);
-        x = ggml_reshape_4d(m, x, c, w, h, n);
-        x = ggml_cont(m, ggml_permute(m, x, 2, 0, 1, 3));
     }
     return x;
 }
diff --git a/tests/test_birefnet.py b/tests/test_birefnet.py
@@ -761,13 +761,15 @@ def test_encode():
 @pytest.mark.parametrize("backend", ["cpu", "vulkan"])
 def test_conv_2d_deform(scenario: str, memory_layout: str, backend: str):
     torch.manual_seed(42)
+    if memory_layout == "nhwc" and backend == "vulkan":
+        pytest.skip("conv_2d_deform with nhwc layout is not supported on Vulkan")
 
     w, h, c_in, c_out, k = {
         "small": (4, 4, 5, 2, 3),
         "large": (49, 38, 81, 17, 7),
     }[scenario]
-    x = input_tensor(1, c_in, h, w) - 0.5
-    weight = input_tensor(c_out, c_in, k, k)
+    x = torch.rand(1, c_in, h, w) - 0.5
+    weight = torch.rand(c_out, c_in, k, k) - 0.5
     offset = 1.0 - input_tensor(1, 2 * k * k, h, w)
     mask = torch.rand(1, k * k, h, w)
     expected = torchvision.ops.deform_conv2d(x, offset, weight, mask=mask, padding=(k // 2, k // 2))
@@ -785,7 +787,7 @@ def test_conv_2d_deform(scenario: str, memory_layout: str, backend: str):
     if memory_layout == "nhwc":
         result = to_nchw(result)
 
-    assert torch.allclose(result, expected, atol=1e-2 if backend == "vulkan" else 1e-5)
+    assert torch.allclose(result, expected, atol=0.1 if backend == "vulkan" else 0.001)
 
 
 class DeformableConv2d(nn.Module):
@@ -916,8 +918,6 @@ def test_global_avg_pool(backend: str):
 
     state = fuse_all_conv_2d_batch_norm(state, "", "1", "2")
     state = convert_to_nhwc(state, key="1.weight")
-    for k, v in state.items():
-        print(f"{k}: {v.shape}")
     x = to_nhwc(x)
     result = workbench.invoke_test("biref_global_avg_pool", x, state, nhwc_layout, backend=backend)
     result = to_nchw(result)
diff --git a/tests/workbench.cpp b/tests/workbench.cpp
@@ -240,8 +240,8 @@ DEF(biref_relative_position_index)(model_ref m, span<tensor> input, param_dict c
 DEF(biref_window_attention)(model_ref m, span<tensor> input, param_dict const& p) {
     int window_size = 3;
     tensor mask = m.find("mask");
-    auto rel_pos_index = birefnet::create_relative_position_index(m.weights_context, window_size);
-    ggml_backend_alloc_ctx_tensors(m.weights_context, workbench_backend());
+    auto rel_pos_index = birefnet::create_relative_position_index(m, window_size);
+    ggml_backend_alloc_ctx_tensors(m, workbench_backend());
     transfer_to_backend(rel_pos_index);
     return {birefnet::window_attention(m, input[0], mask, 2, window_size)};
 }
@@ -254,8 +254,8 @@ DEF(biref_swin_block)(model_ref m, span<tensor> input, param_dict const& p) {
     block.h = 6;
     block.shift = 0;
     tensor mask = m.find("mask");
-    auto rel_pos_index = birefnet::create_relative_position_index(m.weights_context, 3);
-    ggml_backend_alloc_ctx_tensors(m.weights_context, workbench_backend());
+    auto rel_pos_index = birefnet::create_relative_position_index(m, 3);
+    ggml_backend_alloc_ctx_tensors(m, workbench_backend());
     transfer_to_backend(rel_pos_index);
     return {birefnet::swin_block(m, input[0], mask, block)};
 }
@@ -276,9 +276,11 @@ DEF(biref_swin_layer)(model_ref m, span<tensor> input, param_dict const& p) {
     layer.n_heads = 2;
     layer.n_features = 8;
     layer.downsample = true;
-    auto rel_pos_index = birefnet::create_relative_position_index(m.weights_context, 3);
-    ggml_backend_alloc_ctx_tensors(m.weights_context, workbench_backend());
+    auto rel_pos_index = birefnet::create_relative_position_index(m, 3);
+    auto attn_mask = birefnet::create_attention_mask(m, 6, 6, 3);
+    ggml_backend_alloc_ctx_tensors(m, workbench_backend());
     transfer_to_backend(rel_pos_index);
+    transfer_to_backend(attn_mask);
     auto result = birefnet::swin_layer(m, input[0], 6, 6, layer, 3);
     ASSERT(result.w_down == 3 && result.h_down == 3);
     return {result.x_down};
@@ -294,11 +296,11 @@ DEF(biref_swin_transformer)(model_ref m, span<tensor> input, param_dict const& p
             swin_layer_t{2, 4, 8 * 4, true},
             swin_layer_t{2, 2, 8 * 8, false},
         }};
-    auto rel_pos_index = birefnet::create_relative_position_index(m.weights_context, 3);
+    auto rel_pos_index = birefnet::create_relative_position_index(m, 3);
     auto attn_masks = std::array{
-        birefnet::create_attention_mask(m.weights_context, 8, 8, 3), birefnet::create_attention_mask(m.weights_context, 4, 4, 3),
-        birefnet::create_attention_mask(m.weights_context, 2, 2, 3), birefnet::create_attention_mask(m.weights_context, 1, 1, 3)};
-    ggml_backend_alloc_ctx_tensors(m.weights_context, workbench_backend());
+        birefnet::create_attention_mask(m, 8, 8, 3), birefnet::create_attention_mask(m, 4, 4, 3),
+        birefnet::create_attention_mask(m, 2, 2, 3), birefnet::create_attention_mask(m, 1, 1, 3)};
+    ggml_backend_alloc_ctx_tensors(m, workbench_backend());
     transfer_to_backend(rel_pos_index);
     for (auto&& attn_mask : attn_masks) {
         transfer_to_backend(attn_mask);

Original file line number	Diff line number	Diff line change
`@@ -154,13 +154,6 @@ tensor conv_2d_deform(`
`154`	`154`
`155`	`155`	`if (m.flags & model_build_flag::cwhn) {`
`156`	`156`	`x = permute_whcn_to_cwhn(m, x);`
`157`		`- } else if (!(m.flags & model_build_flag::f16_conv_transpose)) {`
`158`		`- // Vulkan WHCN implementation doesn't do the final permute atm`
`159`		`- // only worth fixing if WHCN ends up faster AND we dont implement`
`160`		`- // a direct version of conv_2d_deform`
`161`		`- auto [w, h, c, n] = nelements(x);`
`162`		`- x = ggml_reshape_4d(m, x, c, w, h, n);`
`163`		`- x = ggml_cont(m, ggml_permute(m, x, 2, 0, 1, 3));`
`164`	`157`	`}`
`165`	`158`	`return x;`
`166`	`159`	`}`