
Commit c93122b

sam: fuse batch norm into conv-2d
* model conversion fuses batch norm weights into conv-2d kernel
* inference just does conv-2d with bias
1 parent: f3ba06a
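For reference, the change relies on the standard batch-norm folding identity: batch norm computes gamma * (y - mean) / sqrt(var + eps) + beta on the conv output y, so the per-channel scale gamma / sqrt(var + eps) can be multiplied into the output channels of the conv kernel, and the remainder beta - mean * gamma / sqrt(var + eps) becomes a conv bias. Below is a minimal PyTorch sketch (illustration only, not part of this commit) that applies the same folding as fuse_conv_2d_batch_norm in scripts/convert.py and checks it against the unfused pair:

# Sketch (not from the diff): fold an eval-mode BatchNorm2d into the preceding
# bias-free Conv2d and verify the fused conv reproduces conv -> batch norm.
import torch
from torch import nn

conv = nn.Conv2d(4, 8, kernel_size=3, padding=1, bias=False)
bn = nn.BatchNorm2d(8).eval()      # eval mode: uses running statistics
bn.running_mean.uniform_(-1, 1)    # make the running stats non-trivial
bn.running_var.uniform_(0.5, 2.0)

with torch.no_grad():
    scale = bn.weight / torch.sqrt(bn.running_var + bn.eps)       # gamma / sqrt(var + eps)
    fused = nn.Conv2d(4, 8, kernel_size=3, padding=1, bias=True)
    fused.weight.copy_(conv.weight * scale[:, None, None, None])  # scale each output channel
    fused.bias.copy_(bn.bias - bn.running_mean * scale)

    x = torch.rand(1, 4, 16, 16)
    assert torch.allclose(fused(x), bn(conv(x)), atol=1e-5)

The folding is only valid with batch norm in eval mode, i.e. using the stored running statistics, which is exactly the inference-time setting this commit targets.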

6 files changed (+79 lines, -52 lines)


scripts/convert.py

Lines changed: 30 additions & 15 deletions
@@ -79,7 +79,8 @@ def convert_sam(
         input_filepath, map_location="cpu", weights_only=True
     )

-    for name, tensor in model.items():
+    for key, tensor in model.items():
+        name = key
         name = name.replace("image_encoder.", "enc.")
         name = name.replace("mask_decoder.", "dec.")
         name = name.replace("_image_to_token.", "_i2t.")

@@ -92,14 +93,17 @@ def convert_sam(
             name = name + "_indexed"
             tensor = tensor[:, attention_bias_idxs]

-        if name.endswith("running_var"):
-            tensor = torch.sqrt(tensor + batch_norm_eps)
-
-        if (
-            name.endswith("c.weight")
-            or name.endswith("neck.0.weight")
-            or name.endswith("neck.2.weight")
-        ):
+        if name.endswith("c.weight"):
+            name = name.removesuffix(".c.weight")
+            weight, bias = fuse_conv_2d_batch_norm(model, key.removesuffix(".c.weight"))
+            weight = conv_2d_to_nhwc(weight)
+            add_tensor(writer, f"{name}.weight", weight, quantize, verbose)
+            add_tensor(writer, f"{name}.bias", bias, quantize, verbose)
+            continue
+        if ".bn." in name:
+            continue  # batch norm is fused above
+
+        if name.endswith("neck.0.weight") or name.endswith("neck.2.weight"):
             assert tensor.shape[2] == tensor.shape[3] and tensor.shape[2] <= 3
             tensor = conv_2d_to_nhwc(tensor)

@@ -115,6 +119,19 @@ def convert_sam(
         add_tensor(writer, name, tensor, data_type, verbose)


+def fuse_conv_2d_batch_norm(model: dict[str, Tensor], key: str):
+    conv_weight = model[f"{key}.c.weight"]
+    bn_weight = model[f"{key}.bn.weight"]
+    bn_bias = model[f"{key}.bn.bias"]
+    bn_mean = model[f"{key}.bn.running_mean"]
+    bn_var = model[f"{key}.bn.running_var"]
+
+    bn_weight = bn_weight / torch.sqrt(bn_var + batch_norm_eps)
+    fused_weight = conv_weight * bn_weight[:, None, None, None]
+    fused_bias = bn_bias - bn_mean * bn_weight
+    return fused_weight, fused_bias
+
+
 def build_attention_bias_indices(resolution: int):
     points = list(itertools.product(range(resolution), range(resolution)))
     N = len(points)

@@ -238,11 +255,7 @@ def convert_esrgan(
     "esrgan": "esrgan",
 }

-file_types = {
-    None: 0,
-    "f32": 0,
-    "f16": 1
-}
+file_types = {None: 0, "f32": 0, "f16": 1}

 if __name__ == "__main__":
     # fmt: off

@@ -269,7 +282,9 @@ def convert_esrgan(

     try:
         writer = GGUFWriter(output_path, arch_names.get(args.arch, args.arch))
-        metadata = Metadata.load(args.metadata, input_path.with_suffix(""), args.model_name)
+        metadata = Metadata.load(
+            args.metadata, input_path.with_suffix(""), args.model_name
+        )

         match args.arch:
             case "sam":
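A note on the resulting tensor names, using a hypothetical checkpoint key for illustration (my reading of the conversion loop above, not something spelled out in the commit): each conv + batch-norm pair collapses into a single fused weight/bias pair in the GGUF file, and the ".bn." entries are dropped.

# Hypothetical key, for illustration only -- how one conv + bn pair is mapped:
key = "image_encoder.layers.0.blocks.0.local_conv.c.weight"   # original checkpoint key
name = "enc.layers.0.blocks.0.local_conv"                      # after the renames + removesuffix
# fuse_conv_2d_batch_norm() reads the ".c.weight" and ".bn.*" tensors under the
# original prefix; the writer then receives just two fused tensors:
#   "enc.layers.0.blocks.0.local_conv.weight"  (converted to NHWC)
#   "enc.layers.0.blocks.0.local_conv.bias"
# The remaining ".bn." entries hit the "continue" branch and never reach the file.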

src/visp/mobile-sam.cpp

Lines changed: 10 additions & 19 deletions
@@ -59,33 +59,24 @@ tensor window_reverse(model_ref m, tensor x, int w, int h, int window) {
 // Image encoder
 //

-tensor conv_2d_batch_norm(model_ref m, tensor x, int stride, int pad, int groups) {
-    if (groups == 1) {
-        x = conv_2d(m["c"], x, stride, pad);
-    } else {
-        x = conv_2d_depthwise(m["c"], x, stride, pad);
-    }
-    x = batch_norm_2d(m["bn"], x);
-    return named(m, x);
-}

 tensor patch_embed(model_ref m, tensor x) {
-    x = conv_2d_batch_norm(m["seq.0"], x, 2, 1);
+    x = conv_2d(m["seq.0"], x, 2, 1);
     x = ggml_gelu_inplace(m, x);
-    x = conv_2d_batch_norm(m["seq.2"], x, 2, 1);
+    x = conv_2d(m["seq.2"], x, 2, 1);
     return named(m, x);
 }

 tensor mb_conv(model_ref m, tensor x) {
     tensor shortcut = x;

-    x = conv_2d_batch_norm(m["conv1"], x);
+    x = conv_2d(m["conv1"], x);
     x = ggml_gelu_inplace(m, x);

-    x = conv_2d_batch_norm(m["conv2"], x, 1, 1, /* groups */ int(x->ne[2]));
+    x = conv_2d_depthwise(m["conv2"], x, 1, 1);
     x = ggml_gelu_inplace(m, x);

-    x = conv_2d_batch_norm(m["conv3"], x);
+    x = conv_2d(m["conv3"], x);
     x = ggml_add_inplace(m, x, shortcut);
     x = ggml_gelu_inplace(m, x);

@@ -96,16 +87,16 @@ tensor patch_merging(model_ref m, tensor x, int input_resolution) {
     if (x->ne[2] == 1) {
         x = ggml_reshape_4d(m, x, x->ne[0], input_resolution, input_resolution, x->ne[3]);
     }
-    x = conv_2d_batch_norm(m["conv1"], x);
+    x = conv_2d(m["conv1"], x);
     x = ggml_gelu_inplace(m, x);

-    int c_out = int(m.weights("conv2.c.weight")->ne[0]);
+    int c_out = int(m.weights("conv2.weight")->ne[0]);
     int stride = (c_out == 320 || c_out == 448 || c_out == 576) ? 1 : 2;
-    x = conv_2d_batch_norm(m["conv2"], x, stride, 1, c_out);
+    x = conv_2d_depthwise(m["conv2"], x, stride, 1);
     x = ggml_gelu_inplace(m, x);

     auto [c, h, w, b] = nelements(x);
-    x = conv_2d_batch_norm(m["conv3"], x);
+    x = conv_2d(m["conv3"], x);
     x = ggml_reshape_3d(m, x, c, w * h, b);
     return named(m, x);
 }

@@ -175,7 +166,7 @@ tensor tiny_vit_block(
     x = ggml_add_inplace(m, x, res_x);

     x = ggml_reshape_4d(m, x, c, w, h, b);
-    x = conv_2d_batch_norm(m["local_conv"], x, 1, 1, /* groups */ dim);
+    x = conv_2d_depthwise(m["local_conv"], x, 1, 1);
     x = ggml_reshape_3d(m, x, c, spatial, b);

     tensor x_mlp = mlp(m["mlp"], x);

src/visp/mobile-sam.hpp

Lines changed: 0 additions & 1 deletion
@@ -39,7 +39,6 @@ struct tiny_vit_params {

 float resize_longest_side(i32x2 extent, int target_longest_side);

-tensor conv_2d_batch_norm(model_ref m, tensor x, int stride = 1, int pad = 0, int groups = 1);
 tensor patch_embed(model_ref m, tensor x);
 tensor mb_conv(model_ref m, tensor x);
 tensor patch_merging(model_ref m, tensor x, int input_resolution);

tests/test_mobile_sam.py

Lines changed: 33 additions & 13 deletions
@@ -30,10 +30,30 @@ def __init__(
         self.add_module("bn", bn)


-def add_variance_epsilon(state: dict[str, torch.Tensor], epsilon=1e-5):
-    for k in state:
-        if k.endswith("running_var"):
-            state[k] = torch.sqrt(state[k] + 1e-5).contiguous()
+def fuse_conv_2d_batch_norm(model: dict[str, Tensor], key: str, epsilon=1e-5):
+    conv_weight = model[f"{key}c.weight"]
+    bn_weight = model[f"{key}bn.weight"]
+    bn_bias = model[f"{key}bn.bias"]
+    bn_mean = model[f"{key}bn.running_mean"]
+    bn_var = model[f"{key}bn.running_var"]
+
+    bn_weight = bn_weight / torch.sqrt(bn_var + epsilon)
+    fused_weight = conv_weight * bn_weight[:, None, None, None]
+    fused_bias = bn_bias - bn_mean * bn_weight
+    return fused_weight, fused_bias
+
+
+def fuse_all_conv_2d_batch_norm(model: dict[str, Tensor]):
+    fused_weights = {}
+    for k in model:
+        if k.endswith("c.weight"):
+            key = k.removesuffix("c.weight")
+            weight, bias = fuse_conv_2d_batch_norm(model, key)
+            fused_weights[f"{key}weight"] = weight
+            fused_weights[f"{key}bias"] = bias
+        elif not k.endswith("num_batches_tracked"):
+            fused_weights[k] = model[k]
+    return fused_weights


 def test_conv_2d_batch_norm():

@@ -45,8 +65,8 @@ def test_conv_2d_batch_norm():
     x = torch.rand(1, 4, 8, 8)
     expected = conv2dbn(x)

-    add_variance_epsilon(state)
-    convert_to_nhwc(state)
+    state = fuse_all_conv_2d_batch_norm(state)
+    state = convert_to_nhwc(state)
     x = to_nhwc(x)
     result = workbench.invoke_test("sam_conv_2d_batch_norm", x, state)
     result = to_nchw(result)

@@ -89,7 +109,7 @@ def test_patch_embed():
     x = torch.rand(1, 3, 8, 8)
     expected = patch_embed(x)

-    add_variance_epsilon(state)
+    state = fuse_all_conv_2d_batch_norm(state)
     convert_to_nhwc(state)
     x = to_nhwc(x)
     result = to_nhwc(torch.zeros_like(expected))

@@ -184,7 +204,7 @@ def test_mb_conv():
     x = torch.rand(1, 4, 8, 8)
     expected = mb_conv(x)

-    add_variance_epsilon(state)
+    state = fuse_all_conv_2d_batch_norm(state)
     convert_to_nhwc(state)
     x = to_nhwc(x)
     result = workbench.invoke_test("sam_mb_conv", x, state)

@@ -235,10 +255,10 @@ def test_patch_merging():
     x = torch.rand(1, 8, 32, 32)
     expected = patch_merging(x)

-    add_variance_epsilon(state)
+    state = fuse_all_conv_2d_batch_norm(state)
     convert_to_nhwc(state)
     x = to_nhwc(x)
-    result = result = workbench.invoke_test("sam_patch_merging", x, state)
+    result = workbench.invoke_test("sam_patch_merging", x, state)
     result = result.transpose(1, 2).reshape_as(expected)

     # precision: ggml_gelu uses fp16 look-up table & tanh approximation

@@ -497,9 +517,9 @@ def test_tiny_vit_block():
     state["attn.attention_biases_indexed"] = state["attn.attention_biases"][
         :, tiny_vit_block.attn.attention_bias_idxs
     ]
-    add_variance_epsilon(state)
-    convert_to_nhwc(state)
-    result = result = workbench.invoke_test("sam_tiny_vit_block", x, state)
+    state = fuse_all_conv_2d_batch_norm(state)
+    state = convert_to_nhwc(state)
+    result = workbench.invoke_test("sam_tiny_vit_block", x, state)

     assert torch.allclose(result, expected, rtol=0.001, atol=0.02)

tests/workbench.cpp

Lines changed: 1 addition & 1 deletion
@@ -118,7 +118,7 @@ DEF(linear)(model_ref m, span<tensor> input, param_dict const& p) {
 // Mobile SAM

 DEF(sam_conv_2d_batch_norm)(model_ref m, span<tensor> input, param_dict const& p) {
-    return {sam::conv_2d_batch_norm(m, input[0], 2, 1)};
+    return {conv_2d(m, input[0], 2, 1)}; // fused conv_2d + batch_norm
 }

 DEF(sam_patch_embed)(model_ref m, span<tensor> input, param_dict const& p) {

tests/workbench.py

Lines changed: 5 additions & 3 deletions
@@ -90,7 +90,9 @@ def encode_params(params: dict[str, str | int | float]):
 os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

 root_dir = Path(__file__).parent.parent
-lib = ctypes.CDLL(str(root_dir / "build" / "bin" / "vision-workbench.dll"))
+bin_dir = root_dir / "build" / "bin"
+
+lib = ctypes.CDLL(str(bin_dir / "vision-workbench.dll"))
 lib.visp_workbench.argtypes = [
     ctypes.c_char_p,
     ctypes.POINTER(RawTensor),

@@ -174,15 +176,15 @@ def to_nchw(tensor: torch.Tensor):
     return tensor.permute(0, 3, 1, 2).contiguous()


-def convert_to_nhwc(state: dict[str, torch.Tensor], key="c."):
+def convert_to_nhwc(state: dict[str, torch.Tensor], key=""):
     for k, v in state.items():
         is_conv = (
             v.ndim == 4
             and v.shape[2] == v.shape[3]
             and v.shape[2] in (1, 3, 4, 7)
             and k.endswith("weight")
         )
-        if key in k and is_conv:
+        if is_conv and (key == "" or key in k):
             if v.shape[1] == 1:  # depthwise
                 state[k] = v.permute(2, 3, 1, 0).contiguous()
             else:
