diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 631420d..c8573aa 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -37,6 +37,13 @@ jobs: if: matrix.os == 'windows-latest' uses: microsoft/setup-msbuild@v2 + - name: Vulkan SDK (Windows) + if: matrix.os == 'windows-latest' + uses: humbletim/install-vulkan-sdk@v1.2 + with: + version: 1.4.309.0 + cache: true + - name: Configure (Linux) if: matrix.os == 'ubuntu-22.04' run: > @@ -52,6 +59,7 @@ jobs: cmake . -B build -A x64 -D CMAKE_BUILD_TYPE=Release -D VISP_CI=ON + -D VISP_VULKAN=ON - name: Configure (MacOS) if: matrix.os == 'macos-14' @@ -74,8 +82,7 @@ jobs: # export GGML_VK_VISIBLE_DEVICES=0 # ctest --verbose - - name: Test CPU - if: matrix.os != 'ubuntu-22.04' + - name: Test working-directory: ./build run: ctest --verbose -C Release diff --git a/CMakeLists.txt b/CMakeLists.txt index 8e462d1..3e5493f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,6 +16,8 @@ if(PROJECT_IS_TOP_LEVEL) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) endif() +# Configure assertions + if(VISP_DEV) set(VISP_ASSERT "VISP_ASSERT_BREAK") elseif(VISP_CI) @@ -28,6 +30,8 @@ elseif(CMAKE_BUILD_TYPE) endif() endif() +# Configure address sanitizer + if(VISP_ASAN) if(MSVC) add_compile_options(/fsanitize=address) @@ -38,12 +42,20 @@ if(VISP_ASAN) endif() endif() -if(MSVC) - add_compile_options(/Zi /utf-8) - add_compile_definitions(_CRT_SECURE_NO_WARNINGS) - add_link_options(/DEBUG) # Enable debug symbols also in release builds +# Windows/MSVC specific defaults + +if(MSVC) + list(APPEND VISP_COMP_OPTIONS /utf-8) + list(APPEND VISP_DEFINITIONS _CRT_SECURE_NO_WARNINGS) + if(PROJECT_IS_TOP_LEVEL) + # Enable debug symbols also in release builds + list(APPEND VISP_COMP_OPTIONS /Zi) + list(APPEND VISP_LINK_OPTIONS /DEBUG) + endif() endif() +# Configure warnings + if(VISP_DEV OR VISP_CI) if(MSVC) set(VISP_WARNINGS /W4 /WX /wd4251) @@ -59,7 +71,7 @@ add_subdirectory(depend/stb) if(VISP_FMT_LIB) add_subdirectory(depend/fmt) set(VISP_FMT_LINK fmt::fmt) - set(VISP_FMT_DEFS VISP_FMT_LIB) + list(APPEND VISP_DEFINITIONS VISP_FMT_LIB) endif() set(GGML_VULKAN ${VISP_VULKAN}) diff --git a/README.md b/README.md index db180fa..a89a3df 100644 --- a/README.md +++ b/README.md @@ -48,11 +48,11 @@ Pass `--composite output.png` to composite input and mask.
Use `--help` for more #### API ```c++ -#include <visp/vision.hpp> +#include <visp/vision.h> using namespace visp; int main() { - backend cpu = backend_init(backend_type::cpu); + backend_device cpu = backend_init(backend_type::cpu); sam_model sam = sam_load_model("MobileSAM-F16.gguf", cpu); image_data input_image = image_load("input.jpg"); @@ -180,32 +180,32 @@ as other frameworks for inference speed, but with: * CPU: AMD Ryzen 5 5600X (6 cores) * GPU: NVIDIA GeForce RTX 4070 -#### MobileSAM, 1024x1024, encode + decode +#### MobileSAM, 1024x1024 -| | | _vision.cpp_ | PyTorch | ONNX Runtime | -| :--- | :--- | -----------: | ----------: | -----------: | -| cpu | f32 | 632 + 37 ms | 559 + 42 ms | 728 + 87 ms | -| gpu | f16 | 18 + 3 ms | 10 + 6 ms | | +| | | _vision.cpp_ | PyTorch | ONNX Runtime | +| :--- | :--- | -----------: | ------: | -----------: | +| cpu | f32 | 669 ms | 601 ms | 805 ms | +| gpu | f16 | 19 ms | 16 ms | | #### BiRefNet, 1024x1024 | Model | | | _vision.cpp_ | PyTorch | ONNX Runtime | | :---- | :--- | :--- | -----------: | -------: | -----------: | | Full | cpu | f32 | 16333 ms | 18800 ms | | -| Full | gpu | f16 | 380 ms | 140 ms | | +| Full | gpu | f16 | 243 ms | 140 ms | | | Lite | cpu | f32 | 4505 ms | 10900 ms | 6978 ms | -| Lite | gpu | f16 | 204 ms | 59 ms | 967 ms | +| Lite | gpu | f16 | 86 ms | 59 ms | | #### MI-GAN, 512x512 | Model | | | _vision.cpp_ | PyTorch | | :---------- | :--- | :--- | -----------: | ------: | | 512-places2 | cpu | f32 | 523 ms | 637 ms | -| 512-places2 | gpu | f16 | 24 ms | 17 ms | +| 512-places2 | gpu | f16 | 21 ms | 17 ms | #### Setup -* vision.cpp: using vision-bench, GPU via Vulkan, eg. `vision-bench sam cpu` +* vision.cpp: using vision-bench, GPU via Vulkan, eg. `vision-bench -m sam -b cpu` * PyTorch: v2.7.1+cu128, eager eval, GPU via CUDA, average n iterations after warm-up ## Dependencies (integrated) diff --git a/depend/ggml b/depend/ggml index 41982e7..f77c43a 160000 --- a/depend/ggml +++ b/depend/ggml @@ -1 +1 @@ -Subproject commit 41982e7b5985250c9322bdbdde0ab91bfd4e27f7 +Subproject commit f77c43aadfd9a552bbd2c2c5160e8caf85fe0288 diff --git a/docs/model-implementation-guide.md b/docs/model-implementation-guide.md index 3ea9c17..698f83d 100644 --- a/docs/model-implementation-guide.md +++ b/docs/model-implementation-guide.md @@ -51,7 +51,7 @@ PyTorch code. The great thing about ggml is, you can always follow-reference in your IDE and see almost immediately how things are implemented. It is small enough to be compiled along-side, so you can step into functions, add prints, etc. If some -functionality is missing, you can quickly hack it in. Make sure to use. +functionality is missing, you can quickly hack it in. Make sure to use that. ### vision.cpp @@ -68,7 +68,7 @@ tensor some_module(model_ref m, tensor x, ...) Here `tensor` is short for `ggml_tensor *`, which can be a weight or the result of an operation. The `model_ref` is used to build a compute graph by passing it to ggml functions as replacement for `ggml_context *`. It keeps track of parent -modules and provides a way to access model weights. +modules and provides a way to access model weights by name. `some_module` typically represents the forward function of a PyTorch `nn.Module`. The whole model can be defined with reusable functions. @@ -108,7 +108,8 @@ be converted. It's usually a good opportunity to optimize for inference, throw away training-only stuff, maybe fuse some operations, or convert to a faster memory layout. -If you haven't already, setup a Python environment (just running `uv sync` will do).
+If you haven't already, set up a Python environment (I use +[uv](https://docs.astral.sh/uv/) and simply run `uv sync`). Open `scripts/convert.py` and add a conversion function similar to the existing ones. A 1:1 conversion is very simple: diff --git a/include/visp/image.hpp b/include/visp/image.h similarity index 96% rename from include/visp/image.hpp rename to include/visp/image.h index 234bf94..a2b01fd 100644 --- a/include/visp/image.hpp +++ b/include/visp/image.h @@ -1,6 +1,6 @@ #pragma once -#include "visp/util.hpp" +#include "visp/util.h" #include #include @@ -97,9 +97,12 @@ struct image_data { std::unique_ptr data; }; -// Allocate image data. Pixels are not initialized. +// Allocate image data. Memory is not initialized! VISP_API image_data image_alloc(i32x2 extent, image_format format); +// Set all pixels to zero. +VISP_API void image_clear(image_span const&); + // Load image from file (PNG, JPEG, etc.) VISP_API image_data image_load(char const* filepath); @@ -194,6 +197,7 @@ struct VISP_API tile_layout { VISP_API tile_layout tile_scale(tile_layout const&, int scale); // Merge a tile into the destination image. Both images must be rgb_f32 format. +// Blends pixels from `tile` and `dst` in overlap regions. `dst` must be all zeros initially. VISP_API void tile_merge( image_view const& tile, image_span const& dst, i32x2 tile_coord, tile_layout const& layout); diff --git a/include/visp/ml.hpp b/include/visp/ml.h similarity index 75% rename from include/visp/ml.hpp rename to include/visp/ml.h index b0290d3..2a3826d 100644 --- a/include/visp/ml.hpp +++ b/include/visp/ml.h @@ -1,7 +1,7 @@ #pragma once -#include "visp/image.hpp" -#include "visp/util.hpp" +#include "visp/image.h" +#include "visp/util.h" #include #include @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include namespace visp { @@ -21,30 +23,74 @@ using std::span; using tensor_name = fixed_string; using tensor = ggml_tensor*; +// Memory layout, especially for weights of 2D operations like convolutions +enum tensor_data_layout { unknown, whcn, cwhn }; + // -// Backend +// Backend device - represents the compute hardware enum class backend_type { cpu = 1, gpu = 2 }; // True if the backend library is loaded and has at least one supported device. VISP_API bool backend_is_available(backend_type); -struct VISP_API backend_device { +struct backend_device { ggml_backend_ptr handle; ggml_backend_dev_t device; - backend_type type() const; - ggml_type preferred_float_type() const; - size_t total_memory() const; + VISP_API backend_type type() const; + VISP_API ggml_type preferred_float_type() const; + VISP_API tensor_data_layout preferred_layout() const; + VISP_API size_t total_memory() const; operator ggml_backend_t() const { return handle.get(); } }; +// Initialize a backend device, automatically picking the "best" available. VISP_API backend_device backend_init(); + +// Initialize the most suited device that matches the specified backend type. VISP_API backend_device backend_init(backend_type); +// Set number of threads used by the backend (CPU only).
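+// A hypothetical usage sketch (thread count chosen arbitrarily; both functions are declared in this header): +//   backend_device dev = backend_init(backend_type::cpu); +//   backend_set_n_threads(dev, 8);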
VISP_API void backend_set_n_threads(backend_device&, int n_threads); +// +// Model build flags - backend capabilities, model configuration and optimization + +enum class model_build_flag { + // clang-format off + cwhn = 1 << 0, + conv_2d_direct_cwhn = 1 << 1, + concat_n = 1 << 2, + f16_conv_transpose = 1 << 3, + window_partition = 1 << 4 +}; // clang-format on + +using model_build_flags = flags; + +VISP_API model_build_flags backend_default_flags(backend_type); + +// +// Model file - holds the contents of a GGUF file + +struct model_file { + gguf_context_ptr gguf; + ggml_context_ptr data; + std::string path; + + VISP_API int64_t n_tensors() const; + VISP_API std::string_view arch() const; + VISP_API tensor_data_layout tensor_layout() const; + + VISP_API int64_t key(char const* name) const; + VISP_API int get_int(char const* name) const; + VISP_API std::string_view get_string(char const* name) const; +}; + +// Opens a .gguf file and reads its contents into memory. +VISP_API model_file model_load(char const* filepath); + // // Model weights // @@ -52,33 +98,36 @@ VISP_API void backend_set_n_threads(backend_device&, int n_threads); // * holds the backend buffers for model weight data // * holds buffers for extra tensors such as pre-computed lookup tables -struct VISP_API model_weights { +struct model_weights { ggml_context_ptr context; backend_type buffer_type = backend_type::cpu; ggml_backend_buffer_ptr weights_buffer; std::vector extra_buffers; + model_build_flags flags; - ggml_type float_type() const; + VISP_API ggml_type float_type() const; operator ggml_context*() const { return context.get(); } }; // Creates a GGML context with storage for a fixed number of tensors. // Does not allocate any backend buffers. -VISP_API model_weights model_init(backend_device const&, size_t n_tensors); - -struct model_load_params { - ggml_type float_type = GGML_TYPE_COUNT; // default: use type stored in GGUF file - int n_extra_tensors = 0; // number of extra tensors to allocate in the context -}; - -// Loads model weights from a GGUF file and transfers them to backend buffers. -VISP_API model_weights model_load(char const* filepath, backend_device const&, model_load_params = {}); +VISP_API model_weights model_init(size_t n_tensors); // Allocates backend buffers for the model weights if needed. Does not transfer data. // Returns false and does nothing if all tensors already have an associated backend buffer. VISP_API bool model_allocate(model_weights&, backend_device const&); +// Adds model weights contained in `file` to `weights`. Allocates backend buffers for the +// weights on `device` and transfers the data to the device buffer. +// Optionally converts float weights to the specified data type during transfer. 
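+// Example of the full load path, mirroring load_model_weights() in src/cli/cli.cpp from this diff (model path is just a placeholder): +//   model_file file = model_load("models/MobileSAM-F16.gguf"); +//   model_weights weights = model_init(file.n_tensors()); +//   model_transfer(file, weights, device, device.preferred_float_type(), file.tensor_layout());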
+VISP_API void model_transfer( + model_file const& file, + model_weights& weights, + backend_device const& device, + ggml_type float_type = GGML_TYPE_COUNT, + tensor_data_layout = tensor_data_layout::unknown); + // // Compute graph - wrapper for ggml_cgraph and its associated backend memory @@ -107,18 +156,6 @@ VISP_API void compute(compute_graph const&, backend_device const&); // to support nested modules // * pass anywhere ggml_context* is expected while building the graph -enum class model_build_flag { - // clang-format off - cwhn = 1 << 0, - conv_2d_direct = 1 << 1, - fused_batch_norm = 1 << 2, - concat_n = 1 << 3, - f16_conv_transpose = 1 << 4, - window_partition = 1 << 5 -}; // clang-format on - -using model_build_flags = flags; - struct VISP_API model_ref { ggml_context* weights_context = nullptr; ggml_context* graph_context = nullptr; @@ -127,8 +164,8 @@ struct VISP_API model_ref { tensor_name prefix; model_ref() = default; - model_ref(model_weights& m); - model_ref(model_weights& m, compute_graph& g); + model_ref(model_weights&); + model_ref(model_weights&, compute_graph&); explicit model_ref( ggml_context* weights_context, @@ -247,7 +284,7 @@ struct swin_params { extern swin_params const swin_t_params; extern swin_params const swin_l_params; -VISP_API swin_params swin_detect_params(model_ref); +VISP_API swin_params swin_detect_params(model_file const&); // // implementation @@ -256,4 +293,8 @@ constexpr model_build_flags operator|(model_build_flag lhs, model_build_flag rhs return model_build_flags(uint32_t(lhs) | uint32_t(rhs)); } +constexpr model_build_flags operator~(model_build_flag f) { + return ~model_build_flags(f); +} + } // namespace visp diff --git a/include/visp/util.hpp b/include/visp/util.h similarity index 83% rename from include/visp/util.hpp rename to include/visp/util.h index 99c008f..74e4826 100644 --- a/include/visp/util.hpp +++ b/include/visp/util.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -138,25 +139,23 @@ struct flags { explicit constexpr flags(uint32_t value) : value(value) {} flags& operator|=(E other) { - value |= other; + value |= uint32_t(other); return *this; } - - friend constexpr bool operator&(flags lhs, E rhs) { - return (lhs.value & uint32_t(rhs)) != 0; + + flags& operator|=(flags other) { + value |= other.value; + return *this; } - friend constexpr bool operator&(flags lhs, flags rhs) { - return (lhs.value & rhs.value) != 0; - } + constexpr flags operator~() const { return flags(~value); } + explicit constexpr operator bool() const { return value != 0; } - friend constexpr flags operator|(flags lhs, E rhs) { - return flags(lhs.value | uint32_t(rhs)); - } + friend constexpr flags operator&(flags lhs, E rhs) { return flags(lhs.value & uint32_t(rhs)); } + friend constexpr flags operator&(flags lhs, flags rhs) { return flags(lhs.value & rhs.value); } - friend constexpr flags operator|(flags lhs, flags rhs) { - return flags(lhs.value | rhs.value); - } + friend constexpr flags operator|(flags lhs, E rhs) { return flags(lhs.value | uint32_t(rhs)); } + friend constexpr flags operator|(flags lhs, flags rhs) { return flags(lhs.value | rhs.value); } }; } // namespace visp diff --git a/include/visp/vision.hpp b/include/visp/vision.h similarity index 93% rename from include/visp/vision.hpp rename to include/visp/vision.h index a85b178..8fa4211 100644 --- a/include/visp/vision.hpp +++ b/include/visp/vision.h @@ -8,21 +8,21 @@ // // Vision.cpp comes in 3 main headers: // -// visp/image.hpp +// visp/image.h // // Defines structures to store 
and reference pixel data. Supports loading, saving and // common processing of images. Most tasks take an `image_view` as input, which // is a non-owning reference to external pixel data. Output is returned as // `image_data` (allocated by the library) or written to an `image_span`. // -// visp/ml.hpp +// visp/ml.h // // Contains ML infrastructure shared between all models: loading weights, // transferring data between backend devices (eg. GPU), and executing // compute graphs. Most of these are thin convenience wrappers around GGML. // Alternatively you can use GGML directly for greater flexibility. // -// visp/vision.hpp (this file) +// visp/vision.h (this file) // // Provides a high-level API to run inference on various vision models for // common tasks. These operations are built for simplicity and don't provide @@ -70,9 +70,9 @@ #pragma once -#include "visp/image.hpp" -#include "visp/ml.hpp" -#include "visp/util.hpp" +#include "visp/image.h" +#include "visp/ml.h" +#include "visp/util.h" #include #include @@ -142,14 +142,17 @@ VISP_API image_data birefnet_compute(birefnet_model&, image_view image); // --- BiRefNet pipeline struct birefnet_params { - int image_size = 1024; + int image_size = 1024; // can be -1 for dynamic size + int image_multiple = 32; + i32x2 image_extent = {1024, 1024}; // required if image_size is -1 swin_params encoder; }; using birefnet_buffers = std::array; -VISP_API birefnet_params birefnet_detect_params(model_ref); +VISP_API birefnet_params birefnet_detect_params(model_file const&, i32x2 dynamic_extent = {}); VISP_API birefnet_buffers birefnet_precompute(model_ref, birefnet_params const&); +VISP_API i32x2 birefnet_image_extent(i32x2 input_extent, birefnet_params const&); VISP_API image_data birefnet_process_input(image_view, birefnet_params const&); VISP_API image_data birefnet_process_output( @@ -176,7 +179,7 @@ struct migan_params { bool invert_mask = false; }; -VISP_API migan_params migan_detect_params(model_ref m); +VISP_API migan_params migan_detect_params(model_file const&); VISP_API image_data migan_process_input(image_view image, image_view mask, migan_params const&); VISP_API image_data migan_process_output( @@ -204,7 +207,7 @@ struct esrgan_params { int n_blocks = 23; }; -VISP_API esrgan_params esrgan_detect_params(model_ref); +VISP_API esrgan_params esrgan_detect_params(model_file const&); VISP_API int esrgan_estimate_graph_size(esrgan_params const&); VISP_API tensor esrgan_generate(model_ref, tensor image, esrgan_params const&); diff --git a/models/CMakeLists.txt b/models/CMakeLists.txt index 7e72a01..d1afb96 100644 --- a/models/CMakeLists.txt +++ b/models/CMakeLists.txt @@ -4,27 +4,27 @@ message(STATUS "Checking for models/MobileSAM-F16.gguf") file(DOWNLOAD "https://huggingface.co/Acly/MobileSAM-GGUF/resolve/main/MobileSAM-F16.gguf" ${CMAKE_CURRENT_LIST_DIR}/MobileSAM-F16.gguf - EXPECTED_HASH "SHA256=1e392f58a0e518b7e1e9e5a43403ff0c6d001aeefa6f4e4d2bdf60f7bbe6e4f2" + EXPECTED_HASH "SHA256=b546366475e3ad744bb2eaf7634df88e9aaf25f6622797d2de300f5a530831f7" SHOW_PROGRESS ) message(STATUS "Checking for models/BiRefNet-lite-F16.gguf") file(DOWNLOAD "https://huggingface.co/Acly/BiRefNet-GGUF/resolve/main/BiRefNet-lite-F16.gguf" ${CMAKE_CURRENT_LIST_DIR}/BiRefNet-lite-F16.gguf - EXPECTED_HASH "SHA256=f038843ea7c44a859491df96c7b36815143f7de77b13cbfc0dae5f6eae863fb5" + EXPECTED_HASH "SHA256=7b5397a2c98d66677f8f74317774bbeac49dbb321b8a3dc744af913db71d4fa5" SHOW_PROGRESS ) message(STATUS "Checking for models/MIGAN-512-places2-F16.gguf") file(DOWNLOAD 
"https://huggingface.co/Acly/MIGAN-GGUF/resolve/main/MIGAN-512-places2-F16.gguf" ${CMAKE_CURRENT_LIST_DIR}/MIGAN-512-places2-F16.gguf - EXPECTED_HASH "SHA256=c9f241e96fb5a791f9494fc7d4c2dd793297ae95f05b8423f547d19bea465b81" + EXPECTED_HASH "SHA256=3e47592bf716d0dc306f8dc02d4476cfcdaf2c055fa3c3c8e0ced4db775eb64b" SHOW_PROGRESS ) message(STATUS "Checking for models/RealESRGAN-x4plus_anime-6B-F16.gguf") file(DOWNLOAD "https://huggingface.co/Acly/Real-ESRGAN-GGUF/resolve/main/RealESRGAN-x4plus_anime-6B-F16.gguf" ${CMAKE_CURRENT_LIST_DIR}/RealESRGAN-x4plus_anime-6B-F16.gguf - EXPECTED_HASH "SHA256=b741e68720d7ad6251dee2120bf7579ef816ea16da18299b39f6cbcb0e13ecf0" + EXPECTED_HASH "SHA256=730469c5a2269cdef96d0d58aacf87bcf25d7a0d92256685808e6cdce0675c09" SHOW_PROGRESS ) \ No newline at end of file diff --git a/scripts/convert.py b/scripts/convert.py index 9bc6a7c..054bf42 100644 --- a/scripts/convert.py +++ b/scripts/convert.py @@ -19,6 +19,7 @@ import safetensors import numpy as np +from enum import Enum from pathlib import Path from gguf import GGUFWriter, Metadata, GGML_QUANT_VERSION from torch import Tensor @@ -27,11 +28,29 @@ # Common +class TensorLayout(Enum): + unknown = "unknown" + nchw = "whcn" + nhwc = "cwhn" + + @staticmethod + def parse(s: str): + if s == "whcn" or s == "nchw": + return TensorLayout.nchw + if s == "cwhn" or s == "nhwc": + return TensorLayout.nhwc + return TensorLayout.unknown + + class Writer(GGUFWriter): def __init__(self, path: Path, arch_name: str, float_type: str, verbose: bool): super().__init__(path, arch_name) + self.arch = arch_name self.float_type = float_type + self.tensor_layout = TensorLayout.unknown self.verbose = verbose + self.conv2d_weights: list[int] = [] + self._index = 0 def add_tensor(self, name: str, tensor: Tensor, float_type: str | None = None): if len(name) >= 64: @@ -45,6 +64,33 @@ def add_tensor(self, name: str, tensor: Tensor, float_type: str | None = None): if self.verbose: print(name, tensor.shape, tensor_data.dtype) super().add_tensor(name, tensor_data) + self._index += 1 + + def convert_tensor_2d(self, tensor: Tensor): + # assume tensor is NCHW layout (PyTorch default) + if self.tensor_layout is TensorLayout.nhwc: + return conv_2d_to_nhwc(tensor) + else: + # add tensor index to list to optionally convert layout on the fly later + self.conv2d_weights.append(self._index) + return tensor + + def add_int32(self, name: str, value: int): + print("*", name, "=", value) + super().add_int32(name, value) + + def set_tensor_layout(self, layout: TensorLayout): + print("*", f"{self.arch}.tensor_data_layout", "=", layout.value) + self.tensor_layout = layout + self.add_tensor_data_layout(layout.value) + + def set_tensor_layout_default(self, layout: TensorLayout): + if self.tensor_layout is TensorLayout.unknown: + self.set_tensor_layout(layout) + + def add_conv2d_weight_indices(self): + if self.conv2d_weights: + self.add_array(f"{self.arch}.conv2d_weights", self.conv2d_weights) batch_norm_eps = 1e-5 @@ -124,7 +170,8 @@ def fuse_conv_2d_batch_norm( fused_weight = conv_weight * bn_weight[:, None, None, None] fused_bias = (conv_bias - bn_mean) * bn_weight + bn_bias - writer.add_tensor(name, conv_2d_to_nhwc(fused_weight)) + fused_weight = writer.convert_tensor_2d(fused_weight) + writer.add_tensor(name, fused_weight) writer.add_tensor(name.replace("weight", "bias"), fused_bias) return True @@ -135,7 +182,7 @@ def fuse_conv_2d_batch_norm( elif suffix_norm in key: return True # batch norm was fused above - return False # no match + return False # tensor is not 
part of conv2d+batch-norm # @@ -144,6 +191,7 @@ def fuse_conv_2d_batch_norm( def convert_sam(input_filepath: Path, writer: Writer): writer.add_license("apache-2.0") + writer.set_tensor_layout_default(TensorLayout.nchw) model: dict[str, Tensor] = torch.load(input_filepath, map_location="cpu", weights_only=True) @@ -161,12 +209,19 @@ def convert_sam(input_filepath: Path, writer: Writer): name = name + "_indexed" tensor = tensor[:, attention_bias_idxs] + if "local_conv" in key: # always convert to nhwc + original_tensor_layout = writer.tensor_layout + writer.tensor_layout = TensorLayout.nhwc + fuse_conv_2d_batch_norm(model, key, name, "", "c", "bn", writer) + writer.tensor_layout = original_tensor_layout + continue + if fuse_conv_2d_batch_norm(model, key, name, "", "c", "bn", writer): continue if name.endswith("neck.0.weight") or name.endswith("neck.2.weight"): assert tensor.shape[2] == tensor.shape[3] and tensor.shape[2] <= 3 - tensor = conv_2d_to_nhwc(tensor) + tensor = writer.convert_tensor_2d(tensor) # Precompute dense positional embeddings from random matrix stored in the model if name == "prompt_encoder.pe_layer.positional_encoding_gaussian_matrix": @@ -221,10 +276,29 @@ def build_dense_positional_embeddings( def convert_birefnet(input_filepath: Path, writer: Writer): writer.add_license("mit") + writer.set_tensor_layout_default(TensorLayout.nchw) weights = safetensors.safe_open(input_filepath, "pt") model: dict[str, Tensor] = {k: weights.get_tensor(k) for k in weights.keys()} + x = model["bb.layers.0.blocks.0.attn.proj.bias"] + if x.shape[0] == 96: + writer.add_string("swin.config", "tiny") + writer.add_int32("swin.embed_dim", 96) + elif x.shape[0] == 192: + writer.add_string("swin.config", "large") + writer.add_int32("swin.embed_dim", 192) + else: + raise ValueError(f"Unsupported Swin Transformer embed dim: {x.shape[0]}") + + image_size = 1024 + if "HR" in input_filepath.name or "2K" in input_filepath.name: + image_size = 2048 # actually 2K should rather be 2560x1440 + elif "dynamic" in input_filepath.name: + image_size = -1 + writer.add_int32("birefnet.image_size", image_size) + writer.add_int32("birefnet.image_multiple", 128) + for key, tensor in model.items(): # Shorten some names to fit into 64 chars name = key @@ -259,7 +333,10 @@ def convert_birefnet(input_filepath: Path, writer: Writer): continue # batch norm was fused if is_conv_2d(name, tensor): - tensor = conv_2d_to_nhwc(tensor) + if "patch_embed" in name: # part of SWIN, always store as NHWC + tensor = conv_2d_to_nhwc(tensor) + else: # store rest in requested tensor layout + tensor = writer.convert_tensor_2d(tensor) writer.add_tensor(name, tensor) @@ -270,12 +347,18 @@ def convert_birefnet(input_filepath: Path, writer: Writer): def convert_migan(input_filepath: Path, writer: Writer): writer.add_license("mit") + writer.set_tensor_layout_default(TensorLayout.nchw) model: dict[str, Tensor] = torch.load(input_filepath, weights_only=True) + if "encoder.b512.fromrgb.weight" in model: + writer.add_int32("migan.image_size", 512) + elif "encoder.b256.fromrgb.weight" in model: + writer.add_int32("migan.image_size", 256) + for name, tensor in model.items(): if is_conv_2d(name, tensor): - tensor = conv_2d_to_nhwc(tensor) + tensor = writer.convert_tensor_2d(tensor) writer.add_tensor(name, tensor) @@ -296,10 +379,17 @@ def convert_esrgan(input_filepath: Path, writer: Writer): if getattr(model.model, "plus", False): raise ValueError("RealESRGAN+ (plus) models are not supported yet.") + writer.set_tensor_layout_default(TensorLayout.nchw) + 
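+    # Metadata read back by esrgan_detect_params() on the C++ side; tags such as "23nb" or "64nf" (assumed format) encode the block and filter counts.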
writer.add_int32("esrgan.scale", model.scale) + for tag in model.tags: + if tag.endswith("nb"): + writer.add_int32("esrgan.block_count", int(tag[:-2])) + if tag.endswith("nf"): + writer.add_int32("esrgan.filter_count", int(tag[:-2])) + for name, tensor in model.model.state_dict().items(): if is_conv_2d(name, tensor): - tensor = conv_2d_to_nhwc(tensor) - + tensor = writer.convert_tensor_2d(tensor) writer.add_tensor(name, tensor) @@ -319,10 +409,11 @@ def convert_esrgan(input_filepath: Path, writer: Writer): if __name__ == "__main__": # fmt: off parser = argparse.ArgumentParser(description="Convert model weights (.pt/.pth/.safetensors) to GGUF format.") - parser.add_argument("arch", choices=["sam", "birefnet", "migan", "esrgan"], help="Model architecture") + parser.add_argument("arch", choices=list(arch_names.keys()), help="Model architecture") parser.add_argument("input", type=str, help="Path to the input model file") parser.add_argument("--output", "-o", type=str, default="models", help="Path to the output directory or file") parser.add_argument("--quantize", "-q", choices=["f16"], default=None, help="Convert float weights to the specified data type") + parser.add_argument("--layout", "-l", choices=["whcn", "cwhn"], default=None, help="Tensor data layout for 2D operations like convolution") parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose output") parser.add_argument("--model-name", type=str, default=None, help="Name of the model for metadata") parser.add_argument("--metadata", type=Path, help="Specify the path for an authorship metadata override file") @@ -332,8 +423,9 @@ def convert_esrgan(input_filepath: Path, writer: Writer): input_path = Path(args.input) output_path = Path(args.output) quant_suffix = f"-{args.quantize.upper()}" if args.quantize else "" + layout_suffix = f"-{args.layout.upper()}" if args.layout else "" if output_path.is_dir() or output_path.suffix != ".gguf": - output_path = output_path / f"{input_path.stem}{quant_suffix}.gguf" + output_path = output_path / f"{input_path.stem}{quant_suffix}{layout_suffix}.gguf" print(f"Converting {args.arch}") print("* input: ", input_path) @@ -348,6 +440,9 @@ def convert_esrgan(input_filepath: Path, writer: Writer): ) metadata = Metadata.load(args.metadata, input_path.with_suffix(""), args.model_name) + if args.layout is not None: + writer.set_tensor_layout(TensorLayout.parse(args.layout)) + match args.arch: case "sam": convert_sam(input_path, writer) @@ -362,8 +457,8 @@ def convert_esrgan(input_filepath: Path, writer: Writer): metadata.set_gguf_meta_model(writer) writer.add_quantization_version(GGML_QUANT_VERSION) - writer.add_tensor_data_layout("cwhn") writer.add_file_type(file_types[args.quantize]) + writer.add_conv2d_weight_indices() writer.write_header_to_file() writer.write_kv_data_to_file() writer.write_tensors_to_file(progress=True) diff --git a/src/cli/CMakeLists.txt b/src/cli/CMakeLists.txt index 53fc9a0..aaf21d2 100644 --- a/src/cli/CMakeLists.txt +++ b/src/cli/CMakeLists.txt @@ -1,6 +1,7 @@ add_executable(vision-cli) target_sources(vision-cli PRIVATE cli.cpp) target_include_directories(vision-cli PRIVATE ..) 
-target_compile_definitions(vision-cli PRIVATE ${VISP_ASSERT} ${VISP_FMT_DEFS}) -target_compile_options(vision-cli PRIVATE ${VISP_WARNINGS}) +target_compile_definitions(vision-cli PRIVATE ${VISP_ASSERT} ${VISP_DEFINITIONS}) +target_compile_options(vision-cli PRIVATE ${VISP_WARNINGS} ${VISP_COMP_OPTIONS}) +target_link_options(vision-cli PRIVATE ${VISP_LINK_OPTIONS}) target_link_libraries(vision-cli PRIVATE visioncpp ${VISP_FMT_LINK}) \ No newline at end of file diff --git a/src/cli/cli.cpp b/src/cli/cli.cpp index ba53776..25f2ee9 100644 --- a/src/cli/cli.cpp +++ b/src/cli/cli.cpp @@ -1,6 +1,6 @@ -#include "util/math.hpp" -#include "util/string.hpp" -#include "visp/vision.hpp" +#include "util/math.h" +#include "util/string.h" +#include "visp/vision.h" #include #include @@ -32,7 +32,7 @@ struct cli_args { }; void print_usage() { -char const* const usage = R"( + char const* const usage = R"( Usage: vision-cli [options] Commands: @@ -181,6 +181,7 @@ int main(int argc, char** argv) { case cli_command::birefnet: run_birefnet(args); break; case cli_command::migan: run_migan(args); break; case cli_command::esrgan: run_esrgan(args); break; + case cli_command::none: break; } } catch (std::exception const& e) { @@ -231,22 +232,38 @@ backend_device backend_init(cli_args const& args) { return b; } -model_weights load_model_weights( - cli_args const& args, backend_device const& b, char const* default_model, int n_tensors = 0) { +char const* to_string(tensor_data_layout l) { + switch (l) { + case tensor_data_layout::cwhn: return "cwhn"; + case tensor_data_layout::whcn: return "whcn"; + default: return "unknown"; + } +} + +std::tuple<model_file, model_weights> load_model_weights( + cli_args const& args, + backend_device const& dev, + char const* default_model, + int n_tensors = 0, + tensor_data_layout preferred_layout = tensor_data_layout::unknown) { timer t; char const* model_path = args.model ? args.model : default_model; printf("Loading model weights from '%s'...
", model_path); - model_load_params load_params = { - .float_type = b.preferred_float_type(), - .n_extra_tensors = n_tensors, - }; - model_weights weights = model_load(model_path, b, load_params); + model_file file = model_load(model_path); + model_weights weights = model_init(file.n_tensors() + n_tensors); + if (preferred_layout == tensor_data_layout::unknown) { + preferred_layout = file.tensor_layout(); + } + model_transfer(file, weights, dev, dev.preferred_float_type(), preferred_layout); printf("done (%s)\n", t.elapsed_str()); printf("- float type: %s\n", ggml_type_name(weights.float_type())); - return weights; + if (preferred_layout != tensor_data_layout::unknown) { + printf("- tensor layout: %s\n", to_string(preferred_layout)); + } + return {std::move(file), std::move(weights)}; } void compute_timed(compute_graph const& g, backend_device const& b) { @@ -323,7 +340,8 @@ sam_prompt sam_parse_prompt(std::span args, i32x2 extent) { void run_sam(cli_args const& args) { backend_device backend = backend_init(args); - model_weights weights = load_model_weights(args, backend, "models/MobileSAM-F16.gguf"); + auto [file, weights] = load_model_weights( + args, backend, "models/MobileSAM-F16.gguf", 0, backend.preferred_layout()); sam_params params{}; require_inputs(args.inputs, 1, ""); @@ -376,33 +394,36 @@ void run_sam(cli_args const& args) { void run_birefnet(cli_args const& args) { backend_device backend = backend_init(args); - model_weights weights = load_model_weights(args, backend, "models/BiRefNet-F16.gguf", 6); - birefnet_params params = birefnet_detect_params(weights); - int img_size = params.image_size; + auto [file, weights] = load_model_weights( + args, backend, "models/BiRefNet-F16.gguf", 0, backend.preferred_layout()); require_inputs(args.inputs, 1, ""); image_data image = image_load(args.inputs[0]); + birefnet_params params = birefnet_detect_params(file, image.extent); image_data input_data = birefnet_process_input(image, params); - birefnet_buffers buffers = birefnet_precompute(model_ref(weights), params); - model_allocate(weights, backend); - for (tensor_data const& buf : buffers) { - transfer_to_backend(buf); - } + i32x2 extent = params.image_extent; + char const* image_size_str = params.image_size < 0 ? 
" (dynamic)" : ""; + printf("- model image size: %d%s\n", params.image_size, image_size_str); + printf("- inference image size: %dx%d\n", extent[0], extent[1]); compute_graph graph = compute_graph_init(6 * 1024); model_ref m(weights, graph); - tensor input = compute_graph_input(m, GGML_TYPE_F32, {3, img_size, img_size, 1}); + birefnet_buffers buffers = birefnet_precompute(m, params); + tensor input = compute_graph_input(m, GGML_TYPE_F32, {3, extent[0], extent[1], 1}); tensor output = birefnet_predict(m, input, params); compute_graph_allocate(graph, backend); transfer_to_backend(input, input_data); + for (tensor_data const& buf : buffers) { + transfer_to_backend(buf); + } compute_timed(graph, backend); tensor_data mask_data = transfer_from_backend(output); - image_view mask_output({img_size, img_size}, mask_data.as_f32()); + image_view mask_output(extent, mask_data.as_f32()); image_data mask_resized = image_scale(mask_output, image.extent); image_data mask = image_f32_to_u8(mask_resized, image_format::alpha_u8); image_save(mask, args.output); @@ -416,8 +437,9 @@ void run_migan(cli_args const& args) { backend_device backend = backend_init(args); - model_weights weights = load_model_weights(args, backend, "models/MIGAN-512-places2-F16.gguf"); - migan_params params = migan_detect_params(weights); + auto [file, weights] = load_model_weights( + args, backend, "models/MIGAN-512-places2-F16.gguf", 0, backend.preferred_layout()); + migan_params params = migan_detect_params(file); params.invert_mask = true; // -> inpaint opaque areas require_inputs(args.inputs, 2, " "); @@ -453,8 +475,11 @@ void run_esrgan(cli_args const& args) { backend_device backend = backend_init(args); - model_weights weights = load_model_weights(args, backend, "models/RealESRGAN-x4.gguf"); - esrgan_params params = esrgan_detect_params(weights); + auto [file, weights] = load_model_weights( + args, backend, "models/RealESRGAN-x4.gguf", 0, backend.preferred_layout()); + esrgan_params params = esrgan_detect_params(file); + printf("- scale: %dx\n", params.scale); + printf("- block count: %d\n", params.n_blocks); require_inputs(args.inputs, 1, ""); image_data image = image_load(args.inputs[0]); @@ -465,6 +490,7 @@ image_data input_tile = image_alloc(tiles.tile_size, image_format::rgb_f32); image_data output_tile = image_alloc(tiles_out.tile_size, image_format::rgb_f32); image_data output_image = image_alloc(image.extent * params.scale, image_format::rgb_f32); + image_clear(output_image); compute_graph graph = compute_graph_init(esrgan_estimate_graph_size(params)); model_ref m(weights, graph); diff --git a/src/util/math.hpp b/src/util/math.h similarity index 95% rename from src/util/math.hpp rename to src/util/math.h index 02e290d..835229d 100644 --- a/src/util/math.hpp +++ b/src/util/math.h @@ -1,9 +1,9 @@ #pragma once -#include "visp/util.hpp" +#include "visp/util.h" #include -#include +#include namespace visp { using std::clamp; @@ -12,6 +12,8 @@ using std::clamp; constexpr int32_t div_ceil(int32_t a, int32_t b) { return (a + b - 1) / b; } constexpr int64_t div_ceil(int64_t a, int64_t b) { return (a + b - 1) / b; } +constexpr int32_t next_multiple(int32_t x, int32_t mult) { return div_ceil(x, mult) * mult; } + constexpr float sqr(float x) { return x * x; } constexpr int sqr(int x) { return x * x; } diff --git a/src/util/string.hpp b/src/util/string.h similarity index 99% rename from src/util/string.hpp rename to
src/util/string.h index a0b61a9..220b751 100644 --- a/src/util/string.hpp +++ b/src/util/string.h @@ -1,6 +1,6 @@ #pragma once -#include "visp/util.hpp" +#include "visp/util.h" #include #include diff --git a/src/visp/CMakeLists.txt b/src/visp/CMakeLists.txt index d9ecb4e..5cdbd54 100644 --- a/src/visp/CMakeLists.txt +++ b/src/visp/CMakeLists.txt @@ -12,21 +12,22 @@ target_sources(visioncpp PRIVATE vision.cpp ) target_compile_features(visioncpp PUBLIC cxx_std_20) -target_compile_definitions(visioncpp PRIVATE VISP_API_EXPORT ${VISP_ASSERT} ${VISP_FMT_DEFS}) -target_compile_options(visioncpp PRIVATE ${VISP_WARNINGS}) +target_compile_definitions(visioncpp PRIVATE VISP_API_EXPORT ${VISP_ASSERT} ${VISP_DEFINITIONS}) +target_compile_options(visioncpp PRIVATE ${VISP_WARNINGS} ${VISP_COMP_OPTIONS}) target_include_directories(visioncpp PUBLIC $ $ PRIVATE .. ) +target_link_options(visioncpp PRIVATE ${VISP_LINK_OPTIONS}) target_link_libraries(visioncpp PUBLIC ggml PRIVATE stb ${VISP_FMT_LINK} ) -set_target_properties(visioncpp PROPERTIES - VERSION ${PROJECT_VERSION} - SOVERSION ${PROJECT_VERSION_MAJOR} -) +# set_target_properties(visioncpp PROPERTIES +# VERSION ${PROJECT_VERSION} +# SOVERSION ${PROJECT_VERSION_MAJOR} +# ) if (MSVC AND VISP_TESTS) set_target_properties(visioncpp PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) diff --git a/src/visp/arch/birefnet.cpp b/src/visp/arch/birefnet.cpp index ec2d780..9c9ad37 100644 --- a/src/visp/arch/birefnet.cpp +++ b/src/visp/arch/birefnet.cpp @@ -1,13 +1,11 @@ -#include "visp/arch/birefnet.hpp" -#include "visp/nn.hpp" -#include "visp/vision.hpp" -#include "util/math.hpp" -#include "util/string.hpp" +#include "visp/arch/birefnet.h" +#include "util/math.h" +#include "util/string.h" +#include "visp/nn.h" +#include "visp/vision.h" #include -#include - namespace visp { namespace birefnet { @@ -86,8 +84,8 @@ tensor window_attention(model_ref m, tensor x, tensor mask, int num_heads, int w tensor attn = ggml_mul_mat(m, k, q); - tensor rel_pos_index = - m.with_prefix(format("window_attention_{}", window)).weights("rel_pos_index"); + tensor_name rel_pos_name = format("window_attention_{}.rel_pos_index", window); + tensor rel_pos_index = ggml_get_tensor(m, rel_pos_name.c_str()); tensor rel_pos_table = m.weights("relative_position_bias_table"); tensor rel_pos_bias = ggml_get_rows(m, rel_pos_table, rel_pos_index); rel_pos_bias = ggml_reshape_4d(m, rel_pos_bias, num_heads, window * window, window * window, 1); @@ -235,18 +233,18 @@ tensor_data create_attention_mask(ggml_context* ctx, int64_t w, int64_t h, int w swin_layer_result swin_layer( model_ref m, tensor x, int64_t w, int64_t h, swin_layer_t const& p, int window_size) { // Attention masks need to be precomputed - tensor attn_mask = - m.with_prefix(format("swin_layer_{}x{}", w, h)).find("attn_mask"); + tensor_name attn_mask_name = format("swin_layer_{}x{}.attn_mask", w, h); + tensor attn_mask = ggml_get_tensor(m, attn_mask_name.c_str()); model_ref blocks = m["blocks"]; for (int i = 0; i < p.depth; ++i) { - swin_block_params block_params = { - .n_heads = p.n_heads, - .window_size = window_size, - .w = w, - .h = h, - .shift = i % 2 == 0 ? 0 : window_size / 2}; - x = swin_block(blocks[i], x, attn_mask, block_params); + x = swin_block( + blocks[i], x, attn_mask, + {.n_heads = p.n_heads, + .window_size = window_size, + .w = w, + .h = h, + .shift = i % 2 == 0 ? 
0 : window_size / 2}); } if (p.downsample) { tensor x_down = patch_merging(m["downsample"], x, w, h); @@ -258,6 +256,7 @@ swin_layer_result swin_layer( tensor patch_embed(model_ref m, tensor x, int patch_size) { ASSERT(x->ne[1] % patch_size == 0 && x->ne[2] % patch_size == 0); + m.flags |= model_build_flag::cwhn; x = conv_2d(m["proj"], x, patch_size); auto [c, ww, wh, b] = nelements(x); x = ggml_reshape_3d(m, x, c, ww * wh, b); @@ -287,17 +286,19 @@ swin_result swin_transformer(model_ref m, tensor x, swin_params const& p) { return outs; } -constexpr int32_t bilinear_align_corners = GGML_SCALE_MODE_BILINEAR | (int)GGML_SCALE_FLAG_ALIGN_CORNERS; +constexpr int32_t bilinear_align_corners = GGML_SCALE_MODE_BILINEAR | + (int)GGML_SCALE_FLAG_ALIGN_CORNERS; tensor upscale_to_whcn(model_ref m, tensor x, tensor target) { return interpolate(m, x, {target->ne[0], target->ne[1]}, bilinear_align_corners); } tensor upscale_to(model_ref m, tensor x, tensor target) { - x = permute_cwhn_to_whcn(m, x); - x = interpolate(m, x, {target->ne[1], target->ne[2]}, bilinear_align_corners); - x = permute_whcn_to_cwhn(m, x); - return ggml_cont(m, x); + auto [target_width, target_height, c, n] = nelements_whcn(m, target); + x = contiguous_2d_to_whcn(m, x); + x = interpolate(m, x, {target_width, target_height}, bilinear_align_corners); + x = whcn_to_contiguous_2d(m, x); + return x; } tensor downscale_by_whcn(model_ref m, tensor x, int f) { @@ -305,34 +306,32 @@ tensor downscale_by_whcn(model_ref m, tensor x, int f) { } tensor downscale_by(model_ref m, tensor x, int f) { - x = permute_cwhn_to_whcn(m, x); + x = ggml_cont(m, permute_cwhn_to_whcn(m, x)); x = downscale_by_whcn(m, x, f); - x = permute_whcn_to_cwhn(m, x); - return ggml_cont(m, x); + x = ggml_cont(m, permute_whcn_to_cwhn(m, x)); + return x; } swin_result encode_concat(model_ref m, swin_result& xs, swin_result& xs_low) { // TODO: implement cwhn upscale/interpolate which allows downscale & align_corners=True - // cwhn -> whcn for (int i = 0; i < 4; ++i) { - xs[i] = ggml_cont(m, ggml_permute(m, xs[i], 2, 0, 1, 3)); - xs_low[i] = ggml_permute(m, xs_low[i], 2, 0, 1, 3); + xs[i] = ggml_cont(m, permute_cwhn_to_whcn(m, xs[i])); + xs_low[i] = permute_cwhn_to_whcn(m, xs_low[i]); } - + // clang-format off xs[0] = concat(m, {xs[0], upscale_to_whcn(m, xs_low[0], xs[0])}, 2); xs[1] = concat(m, {xs[1], upscale_to_whcn(m, xs_low[1], xs[1])}, 2); xs[2] = concat(m, {xs[2], upscale_to_whcn(m, xs_low[2], xs[2])}, 2); xs[3] = concat(m, {xs[3], upscale_to_whcn(m, xs_low[3], xs[3])}, 2); + xs[3] = concat(m, {downscale_by_whcn(m, xs[0], 8), + downscale_by_whcn(m, xs[1], 4), + downscale_by_whcn(m, xs[2], 2), + xs[3]}, /*dim = */ 2); + // clang-format on - xs[3] = concat( - m, - {downscale_by_whcn(m, xs[0], 8), downscale_by_whcn(m, xs[1], 4), - downscale_by_whcn(m, xs[2], 2), xs[3]}, - /*dim = */ 2); - - // whcn -> cwhn + // whcn -> native for (int i = 0; i < 4; ++i) { - xs[i] = ggml_cont(m, ggml_permute(m, xs[i], 1, 2, 0, 3)); + xs[i] = whcn_to_contiguous_2d(m, xs[i]); } return xs; } @@ -364,12 +363,11 @@ tensor deformable_conv_2d(model_ref m, tensor x, int stride, int pad) { } tensor mean_2d(model_ref m, tensor x) { - auto [c, w, h, n] = nelements(x); - x = ggml_cont(m, ggml_permute(m, x, 2, 0, 1, 3)); // cwhn -> whcn - x = ggml_mean(m, x); - x = ggml_reshape_3d(m, x, h, c, n); + auto [w, h, c, n] = nelements_whcn(m, x); + x = contiguous_2d_to_whcn(m, x); + x = ggml_reshape_3d(m, x, w * h, c, n); x = ggml_mean(m, x); - x = ggml_reshape_4d(m, x, c, 1, 1, n); + x = is_cwhn(m) ? 
ggml_reshape_4d(m, x, c, 1, 1, n) : ggml_reshape_4d(m, x, 1, 1, c, n); return x; } @@ -389,6 +387,7 @@ tensor aspp_module_deformable(model_ref m, tensor x, int padding) { tensor aspp_deformable(model_ref m, tensor x) { const int kernel_sizes[] = {1, 3, 7}; + const int channel_dim = is_cwhn(m) ? 0 : 2; tensor x1 = aspp_module_deformable(m["aspp1"], x); model_ref aspp_deforms = m["aspp_deforms"]; @@ -398,10 +397,11 @@ tensor aspp_deformable(model_ref m, tensor x) { x_deforms[i] = aspp_module_deformable(aspp_deforms[i], x, padding); } tensor x5 = global_avg_pool(m["global_avg_pool"], x); - x5 = permute_cwhn_to_whcn(m, x5); - x5 = interpolate(m, x5, {x1->ne[1], x1->ne[2]}, bilinear_align_corners); - x5 = ggml_cont(m, permute_whcn_to_cwhn(m, x5)); - x = concat(m, {x1, x_deforms[0], x_deforms[1], x_deforms[2], x5}, 0); + auto [w1, h1, c, n] = nelements_whcn(m, x1); + x5 = contiguous_2d_to_whcn(m, x5); + x5 = interpolate(m, x5, {w1, h1}, bilinear_align_corners); + x5 = whcn_to_contiguous_2d(m, x5); + x = concat(m, {x1, x_deforms[0], x_deforms[1], x_deforms[2], x5}, channel_dim); x = conv_2d_batch_norm(m["conv1"], x); x = ggml_relu_inplace(m, x); @@ -440,17 +440,22 @@ tensor gdt_conv(model_ref m, tensor x) { } tensor decode(model_ref m, tensor x, swin_result const& features) { + const int channel_dim = is_cwhn(m) ? 0 : 2; + tensor x1 = features[0]; tensor x2 = features[1]; tensor x3 = features[2]; tensor x4 = features[3]; - tensor x_whcn = ggml_cont(m, ggml_permute(m, x, 2, 0, 1, 3)); // cwhn -> whcn - + tensor x_whcn = ggml_cont(m, permute_cwhn_to_whcn(m, x)); + if (is_whcn(m)) { + x = x_whcn; + } { - tensor patches = image_to_patches(m, x_whcn, x4->ne[1], x4->ne[2]); - patches = ggml_cont(m, ggml_permute(m, patches, 1, 2, 0, 3)); // whcn -> cwhn + auto [w, h, c, n] = nelements_whcn(m, x4); + tensor patches = image_to_patches(m, x_whcn, w, h); + patches = whcn_to_contiguous_2d(m, patches); patches = simple_conv(m["ipt_blk5"], patches); - x4 = ggml_concat(m, x4, patches, 0); + x4 = ggml_concat(m, x4, patches, channel_dim); } tensor p4 = basic_decoder_block(m["block4"], x4); tensor p4_gdt = gdt_conv(m["gdt_convs_4"], p4); @@ -463,10 +468,11 @@ tensor decode(model_ref m, tensor x, swin_result const& features) { tensor _p3 = ggml_add_inplace(m, _p4, x3); { - tensor patches = image_to_patches(m, x_whcn, _p3->ne[1], _p3->ne[2]); - patches = ggml_cont(m, ggml_permute(m, patches, 1, 2, 0, 3)); // whcn -> cwhn + auto [w, h, c, n] = nelements_whcn(m, _p3); + tensor patches = image_to_patches(m, x_whcn, w, h); + patches = whcn_to_contiguous_2d(m, patches); patches = simple_conv(m["ipt_blk4"], patches); - _p3 = ggml_concat(m, _p3, patches, 0); + _p3 = ggml_concat(m, _p3, patches, channel_dim); } tensor p3 = basic_decoder_block(m["block3"], _p3); tensor p3_gdt = gdt_conv(m["gdt_convs_3"], p3); @@ -479,10 +485,11 @@ tensor decode(model_ref m, tensor x, swin_result const& features) { tensor _p2 = ggml_add_inplace(m, _p3, x2); { - tensor patches = image_to_patches(m, x_whcn, _p2->ne[1], _p2->ne[2]); - patches = ggml_cont(m, ggml_permute(m, patches, 1, 2, 0, 3)); // whcn -> cwhn + auto [w, h, c, n] = nelements_whcn(m, _p2); + tensor patches = image_to_patches(m, x_whcn, w, h); + patches = whcn_to_contiguous_2d(m, patches); patches = simple_conv(m["ipt_blk3"], patches); - _p2 = ggml_concat(m, _p2, patches, 0); + _p2 = ggml_concat(m, _p2, patches, channel_dim); } tensor p2 = basic_decoder_block(m["block2"], _p2); tensor p2_gdt = gdt_conv(m["gdt_convs_2"], p2); @@ -495,15 +502,16 @@ tensor decode(model_ref m, tensor 
x, swin_result const& features) { tensor _p1 = ggml_add_inplace(m, _p2, x1); { - tensor patches = image_to_patches(m, x_whcn, _p1->ne[1], _p1->ne[2]); - patches = ggml_cont(m, ggml_permute(m, patches, 1, 2, 0, 3)); // whcn -> cwhn + auto [w, h, c, n] = nelements_whcn(m, _p1); + tensor patches = image_to_patches(m, x_whcn, w, h); + patches = whcn_to_contiguous_2d(m, patches); patches = simple_conv(m["ipt_blk2"], patches); - _p1 = ggml_concat(m, _p1, patches, 0); + _p1 = ggml_concat(m, _p1, patches, channel_dim); } _p1 = basic_decoder_block(m["block1"], _p1); _p1 = upscale_to(m, _p1, x); tensor p1_ipt = simple_conv(m["ipt_blk1"], x); - _p1 = ggml_concat(m, _p1, p1_ipt, 0); + _p1 = ggml_concat(m, _p1, p1_ipt, channel_dim); tensor p1_out = conv_2d(m["conv_out1.0"], _p1); p1_out = ggml_sigmoid_inplace(m, p1_out); @@ -528,10 +536,10 @@ image_data birefnet_process_input(image_view image, birefnet_params const& p) { constexpr f32x4 mean = f32x4{0.485f, 0.456f, 0.406f, 0.f}; constexpr f32x4 std = f32x4{0.229f, 0.224f, 0.225f, 1.f}; - std::optional resized; - if (image.extent[0] != p.image_size || image.extent[1] != p.image_size) { - resized = image_scale(image, i32x2{p.image_size, p.image_size}); - image = image_view(*resized); + image_data resized; + if (image.extent != p.image_extent) { + resized = image_scale(image, p.image_extent); + image = image_view(resized); } return image_u8_to_f32(image, image_format::rgb_f32, -mean, 1.f / std); @@ -540,10 +548,9 @@ image_data birefnet_process_input(image_view image, birefnet_params const& p) { image_data birefnet_process_output( span mask_data, i32x2 target_extent, birefnet_params const& p) { - i32x2 model_extent = {p.image_size, p.image_size}; - image_view mask_output(model_extent, mask_data); + image_view mask_output(p.image_extent, mask_data); image_data mask_resized; - if (model_extent != target_extent) { + if (p.image_extent != target_extent) { mask_resized = image_scale(mask_output, target_extent); mask_output = mask_resized; } @@ -552,12 +559,13 @@ image_data birefnet_process_output( birefnet_buffers birefnet_precompute(model_ref m, birefnet_params const& params) { int w = params.encoder.window_size; - int res = params.image_size / 4; + int width = params.image_extent[0] / 4; + int height = params.image_extent[1] / 4; birefnet_buffers b; b[0] = birefnet::create_relative_position_index(m, w); for (int i = 0; i < swin_params::n_layers + 1; ++i) { - b[i + 1] = birefnet::create_attention_mask(m, res >> i, res >> i, w); + b[i + 1] = birefnet::create_attention_mask(m, width >> i, height >> i, w); } return b; } @@ -584,24 +592,37 @@ const swin_params swin_l_params = { swin_layer_t{2, 48, 192 * 8, false}}}; // clang-format on -swin_params swin_detect_params(model_ref m) { - tensor t = m.find("bb.layers.0.blocks.0.attn.proj.bias"); - if (t == nullptr) { - throw except("Failed to detect model parameters"); - } - if (t->ne[0] == 96) { +swin_params swin_detect_params(model_file const& f) { + int embed_dim = f.get_int("swin.embed_dim"); + if (embed_dim == 96) { return swin_t_params; - } else if (t->ne[0] == 192) { + } else if (embed_dim == 192) { return swin_l_params; } else { - throw except("Unsupported Swin Transformer embed dim: {}", t->ne[0]); + throw except("Unsupported Swin Transformer embed dim: {}", embed_dim); } } -birefnet_params birefnet_detect_params(model_ref m) { +i32x2 birefnet_image_extent(i32x2 input_extent, birefnet_params const& p) { + i32x2 extent{p.image_size, p.image_size}; + if (p.image_size == -1) { + ASSERT(input_extent[0] > 0 && 
input_extent[1] > 0); + extent = { + next_multiple(input_extent[0], p.image_multiple), + next_multiple(input_extent[1], p.image_multiple)}; + } + return extent; +} + +birefnet_params birefnet_detect_params(model_file const& f, i32x2 dynamic_extent) { + if (std::string_view arch = f.arch(); arch != "birefnet") { + throw except("Architecture expected to be 'birefnet', but was '{}' ({})", arch, f.path); + } birefnet_params p; - p.image_size = 1024; // TODO: support 2K models - p.encoder = swin_detect_params(m); + p.image_size = f.get_int("birefnet.image_size"); + p.image_multiple = f.get_int("birefnet.image_multiple"); + p.image_extent = birefnet_image_extent(dynamic_extent, p); + p.encoder = swin_detect_params(f); return p; } diff --git a/src/visp/arch/birefnet.hpp b/src/visp/arch/birefnet.h similarity index 97% rename from src/visp/arch/birefnet.hpp rename to src/visp/arch/birefnet.h index 9cbbda2..7f109ad 100644 --- a/src/visp/arch/birefnet.hpp +++ b/src/visp/arch/birefnet.h @@ -1,7 +1,7 @@ #pragma once -#include "visp/ml.hpp" -#include "visp/image.hpp" +#include "visp/ml.h" +#include "visp/image.h" #include diff --git a/src/visp/arch/esrgan.cpp b/src/visp/arch/esrgan.cpp index fdf48e0..a10deb7 100644 --- a/src/visp/arch/esrgan.cpp +++ b/src/visp/arch/esrgan.cpp @@ -1,7 +1,7 @@ -#include "visp/arch/esrgan.hpp" -#include "visp/nn.hpp" -#include "visp/vision.hpp" -#include "util/string.hpp" +#include "visp/arch/esrgan.h" +#include "util/string.h" +#include "visp/nn.h" +#include "visp/vision.h" #include #include @@ -10,8 +10,8 @@ namespace visp { namespace esrgan { tensor upsample(model_ref m, tensor x) { - auto [c, w, h, n] = nelements(x); - x = ggml_interpolate(m, x, int(c), int(w * 2), int(h * 2), int(n), GGML_SCALE_MODE_NEAREST); + auto [w, h, c, n] = nelements_whcn(m, x); + x = interpolate(m, x, {w * 2, h * 2}, GGML_SCALE_MODE_NEAREST); x = conv_2d(m, x, 1, 1); x = ggml_leaky_relu(m, x, 0.2f, true); return named(m, x); @@ -24,14 +24,15 @@ tensor conv_block(model_ref m, tensor x) { } tensor risidual_dense_block(model_ref m, tensor x) { + int dim = (m.flags & model_build_flag::cwhn) ? 
0 : 2; tensor x1 = conv_block(m["conv1"], x); - tensor c1 = concat(m, {x, x1}, 0); + tensor c1 = concat(m, {x, x1}, dim); tensor x2 = conv_block(m["conv2"], c1); - tensor c2 = concat(m, {c1, x2}, 0); + tensor c2 = concat(m, {c1, x2}, dim); tensor x3 = conv_block(m["conv3"], c2); - tensor c3 = concat(m, {c2, x3}, 0); + tensor c3 = concat(m, {c2, x3}, dim); tensor x4 = conv_block(m["conv4"], c3); - tensor c4 = concat(m, {c3, x4}, 0); + tensor c4 = concat(m, {c3, x4}, dim); tensor x5 = conv_2d(m["conv5.0"], c4, 1, 1); x5 = ggml_scale_inplace(m, x5, 0.2f); x = ggml_add(m, x, x5); @@ -52,6 +53,7 @@ tensor rrdb(model_ref m, tensor x) { tensor esrgan_generate(model_ref m, tensor x, esrgan_params const& p) { m = m["model"]; + x = cwhn_to_contiguous_2d(m, x); x = conv_2d(m[0], x, 1, 1); tensor sub = x; @@ -71,41 +73,23 @@ tensor esrgan_generate(model_ref m, tensor x, esrgan_params const& p) { x = ggml_leaky_relu(m, x, 0.2f, true); x = conv_2d(m[seq + 2], x, 1, 1); + x = contiguous_2d_to_cwhn(m, x); return compute_graph_output(m, x, "result"); } -esrgan_params esrgan_detect_params(model_ref m) { - esrgan_params p; - p.n_blocks = 0; - int model_len = 0; - - ggml_context* ctx = m.weights_context; - for (tensor t = ggml_get_first_tensor(ctx); t; t = ggml_get_next_tensor(ctx, t)) { - auto name = std::string_view(ggml_get_name(t)); - if (name.starts_with("model.")) { - name = name.substr(6); - int x = 0; - auto r = std::from_chars(name.data(), name.data() + 2, x); - model_len = std::max(model_len, x + 1); - - size_t i_dot = name.find('.'); - if (i_dot == std::string_view::npos) { - continue; - } - name = name.substr(i_dot + 1, 11); - if (name.starts_with("sub.") && (name.ends_with("RDB1") || name.ends_with("RDB1."))) { - r = std::from_chars(name.data() + 4, name.data() + 6, x); - p.n_blocks = std::max(p.n_blocks, x + 1); - } - } +esrgan_params esrgan_detect_params(model_file const& f) { + if (std::string_view arch = f.arch(); arch != "esrgan") { + throw except("Architecture expected to be 'esrgan', but was '{}' ({})", arch, f.path); } - // 3 layers per upscale block, each upscales x2, 5 blocks for the rest of the model - p.scale = 1 << ((model_len - 5) / 3); + esrgan_params p; + p.scale = f.get_int("esrgan.scale"); + p.n_blocks = f.get_int("esrgan.block_count"); + if (p.scale < 2 || p.scale > 4) { - throw except("Unsupported scale: {}", p.scale); + throw except("ESRGAN: unsupported scale: {}", p.scale); } if (p.n_blocks < 1 || p.n_blocks > 23) { - throw except("Invalid number of blocks: {}", p.n_blocks); + throw except("ESRGAN: invalid number of blocks: {}", p.n_blocks); } return p; } diff --git a/src/visp/arch/esrgan.hpp b/src/visp/arch/esrgan.h similarity index 91% rename from src/visp/arch/esrgan.hpp rename to src/visp/arch/esrgan.h index 2cfe32a..b5a7b7b 100644 --- a/src/visp/arch/esrgan.hpp +++ b/src/visp/arch/esrgan.h @@ -1,6 +1,6 @@ #pragma once -#include "visp/ml.hpp" +#include "visp/ml.h" namespace visp::esrgan { diff --git a/src/visp/arch/migan.cpp b/src/visp/arch/migan.cpp index 5371650..67f1b70 100644 --- a/src/visp/arch/migan.cpp +++ b/src/visp/arch/migan.cpp @@ -1,9 +1,9 @@ -#include "visp/arch/migan.hpp" -#include "visp/image-impl.hpp" -#include "visp/nn.hpp" -#include "visp/vision.hpp" -#include "util/math.hpp" -#include "util/string.hpp" +#include "visp/arch/migan.h" +#include "util/math.h" +#include "util/string.h" +#include "visp/image-impl.h" +#include "visp/nn.h" +#include "visp/vision.h" #include #include @@ -30,20 +30,30 @@ tensor downsample_2d(model_ref m, tensor x) { } tensor 
upsample_2d(model_ref m, tensor x) { - tensor filter_const = m.weights("filter_const"); - filter_const = ggml_reshape_4d(m, filter_const, 1, filter_const->ne[0], filter_const->ne[1], 1); + tensor filter = m.weights("filter_const"); + if (m.flags & model_build_flag::cwhn) { + filter = ggml_reshape_4d(m, filter, 1, filter->ne[0], filter->ne[1], 1); + } - auto [c, w, h, b] = nelements(x); - x = ggml_interpolate(m, x, int(c), int(w * 2), int(h * 2), int(b), GGML_SCALE_MODE_NEAREST); - x = ggml_mul_inplace(m, x, filter_const); + auto [w, h, c, n] = nelements_whcn(m, x); + x = interpolate(m, x, {w * 2, h * 2}, GGML_SCALE_MODE_NEAREST); + x = ggml_mul_inplace(m, x, filter); x = conv_2d_depthwise(m["filter"], x, 1, 2); // 4x4 filter - x = slice(m, x, {}, {0, -1}, {0, -1}, {}); // remove padding from right and bottom + + // remove padding from right and bottom + if (m.flags & model_build_flag::cwhn) { + x = slice(m, x, {}, {0, -1}, {0, -1}, {}); + } else { + x = slice(m, x, {0, -1}, {0, -1}, {}, {}); + } x = ggml_cont(m, x); // required by subsequent ggml_scale for some reason return named(m, x); } tensor separable_conv_2d(model_ref m, tensor x, flags flags) { - int pad = int(m["conv1"].weights("weight")->ne[2] / 2); + int kdim = (m.flags & model_build_flag::cwhn) ? 2 : 0; // to get kernel size + int pad = int(m["conv1"].weights("weight")->ne[kdim] / 2); + x = conv_2d_depthwise(m["conv1"], x, 1, pad); if (flags & conv::activation) { x = lrelu_agc(m, x, 0.2f, sqrt2, 256); @@ -60,7 +70,9 @@ tensor separable_conv_2d(model_ref m, tensor x, flags flags) { if (flags & conv::noise) { tensor noise = m.weights("noise_const"); noise = ggml_mul_inplace(m, noise, m.weights("noise_strength")); - noise = ggml_reshape_4d(m, noise, 1, noise->ne[0], noise->ne[1], 1); + if (m.flags & model_build_flag::cwhn) { + noise = ggml_reshape_4d(m, noise, 1, noise->ne[0], noise->ne[1], 1); + } x = ggml_add_inplace(m, x, noise); } if (flags & conv::activation) { @@ -70,6 +82,7 @@ tensor separable_conv_2d(model_ref m, tensor x, flags flags) { } tensor from_rgb(model_ref m, tensor x) { + x = cwhn_to_contiguous_2d(m, x); x = conv_2d(m["fromrgb"], x); x = lrelu_agc(m, x, 0.2f, sqrt2, 256); return named(m, x); @@ -122,6 +135,7 @@ tensor synthesis(model_ref m, tensor x_in, Features feats, int res) { model_ref block = m[format("b{}", res >> i)]; std::tie(x, img) = synthesis_block(block, x, feats[i], img, conv::upsample, conv::noise); } + img = contiguous_2d_to_cwhn(m, img); return img; } @@ -150,14 +164,13 @@ tensor migan_generate(model_ref m, tensor image, migan_params const& p) { return compute_graph_output(m, result); } -migan_params migan_detect_params(model_ref m) { - if (m.find("encoder.b512.fromrgb.weight") != nullptr) { - return migan_params{512}; - } else if (m.find("encoder.b256.fromrgb.weight") != nullptr) { - return migan_params{256}; - } else { - throw std::runtime_error("Failed to detect model parameters"); +migan_params migan_detect_params(model_file const& f) { + if (std::string_view arch = f.arch(); arch != "migan") { + throw except("Architecture expected to be 'migan', but was '{}' ({})", arch, f.path); } + migan_params p; + p.resolution = f.get_int("migan.image_size"); + return p; } image_data migan_process_input(image_view image, image_view mask, migan_params const& p) { @@ -182,7 +195,7 @@ image_data migan_process_input(image_view image, image_view mask, migan_params c } image_data migan_process_output(std::span data, i32x2 extent, migan_params const& p) { - i32x2 model_extent = {p.resolution,p.resolution}; + 
i32x2 model_extent = {p.resolution, p.resolution}; image_view image(model_extent, image_format::rgb_f32, data.data()); image_data resized; if (model_extent != extent) { diff --git a/src/visp/arch/migan.hpp b/src/visp/arch/migan.h similarity index 93% rename from src/visp/arch/migan.hpp rename to src/visp/arch/migan.h index d7bec80..e2ccd51 100644 --- a/src/visp/arch/migan.hpp +++ b/src/visp/arch/migan.h @@ -1,8 +1,8 @@ #pragma once -#include "visp/image.hpp" -#include "visp/ml.hpp" -#include "visp/util.hpp" +#include "visp/image.h" +#include "visp/ml.h" +#include "visp/util.h" #include #include diff --git a/src/visp/arch/mobile-sam.cpp b/src/visp/arch/mobile-sam.cpp index eb0da1f..7beaef4 100644 --- a/src/visp/arch/mobile-sam.cpp +++ b/src/visp/arch/mobile-sam.cpp @@ -1,8 +1,8 @@ -#include "visp/arch/mobile-sam.hpp" -#include "visp/nn.hpp" -#include "visp/vision.hpp" -#include "util/math.hpp" -#include "util/string.hpp" +#include "visp/arch/mobile-sam.h" +#include "visp/nn.h" +#include "visp/vision.h" +#include "util/math.h" +#include "util/string.h" #include @@ -23,17 +23,16 @@ tensor conv_2d_depthwise_batch_norm(model_ref m, tensor x, int stride = 1, int p } tensor window_partition(model_ref m, tensor x, int window) { - int64_t c = x->ne[0]; - int64_t b = x->ne[3]; + auto [c, w, h, b] = nelements(x); if (m.flags & model_build_flag::window_partition) { x = ggml_win_part(m, x, window); x = ggml_reshape_3d(m, x, c, window * window, x->ne[3]); return x; } - int64_t px = (window - x->ne[1] % window) % window; - int64_t py = (window - x->ne[2] % window) % window; - int64_t npw = (x->ne[1] + px) / window; - int64_t nph = (x->ne[2] + py) / window; + int64_t px = (window - w % window) % window; + int64_t py = (window - h % window) % window; + int64_t npw = (w + px) / window; + int64_t nph = (h + py) / window; if (px > 0 || py > 0) { x = ggml_pad(m, x, 0, int(px), int(py), 0); @@ -93,21 +92,24 @@ tensor mb_conv(model_ref m, tensor x) { return named(m, x); } -tensor patch_merging(model_ref m, tensor x, int input_resolution) { - if (x->ne[2] == 1) { - x = ggml_reshape_4d(m, x, x->ne[0], input_resolution, input_resolution, x->ne[3]); - } +tensor patch_merging(model_ref m, tensor x) { x = conv_2d_batch_norm(m["conv1"], x); x = ggml_gelu_inplace(m, x); - int c_out = int(m.weights("conv2.c.weight")->ne[0]); + int c_out_dim = is_cwhn(m) ? 0 : 3; + int c_out = int(m.weights("conv2.c.weight")->ne[c_out_dim]); int stride = (c_out == 320 || c_out == 448 || c_out == 576) ? 
1 : 2; x = conv_2d_depthwise_batch_norm(m["conv2"], x, stride, 1); x = ggml_gelu_inplace(m, x); - auto [c, h, w, b] = nelements(x); + auto [w, h, c, b] = nelements_whcn(m, x); x = conv_2d_batch_norm(m["conv3"], x); - x = ggml_reshape_3d(m, x, c, w * h, b); + if (is_whcn(m)) { + x = ggml_reshape_3d(m, x, w * h, c, b); + x = ggml_cont(m, ggml_permute(m, x, 1, 0, 2, 3)); + } else { + x = ggml_reshape_3d(m, x, c, w * h, b); + } // -> always [c, wh, b] return named(m, x); } @@ -175,8 +177,10 @@ tensor tiny_vit_block( x = ggml_reshape_3d(m, x, c, spatial, b); x = ggml_add_inplace(m, x, res_x); + model_ref local_conv = m["local_conv"]; + local_conv.flags |= model_build_flag::cwhn; x = ggml_reshape_4d(m, x, c, w, h, b); - x = conv_2d_depthwise_batch_norm(m["local_conv"], x, 1, 1); + x = conv_2d_depthwise_batch_norm(local_conv, x, 1, 1); x = ggml_reshape_3d(m, x, c, spatial, b); tensor x_mlp = mlp(m["mlp"], x); @@ -189,7 +193,7 @@ tensor conv_layer(model_ref m, tensor x, tiny_vit_params::layer p) { for (int i = 0; i < p.depth; ++i) { x = mb_conv(block[i], x); } - x = patch_merging(m["downsample"], x, p.resolution); + x = patch_merging(m["downsample"], x); return named(m, x); } @@ -199,12 +203,15 @@ tensor basic_layer(model_ref m, tensor x, tiny_vit_params::layer const& p) { x = tiny_vit_block(blocks[i], x, p.resolution, p.embed_dim, p.num_heads, p.window_size); } if (p.downsample) { - x = patch_merging(m["downsample"], x, p.resolution); + x = ggml_reshape_4d(m, x, x->ne[0], p.resolution, p.resolution, x->ne[2]); + x = cwhn_to_contiguous_2d(m, x); + x = patch_merging(m["downsample"], x); } return named(m, x); } tensor tiny_vit(model_ref m, tensor x, tiny_vit_params const& p) { + x = cwhn_to_contiguous_2d(m, x); x = patch_embed(m["patch_embed"], x); x = conv_layer(m["layers.0"], x, p.layers[0]); @@ -216,10 +223,15 @@ tensor tiny_vit(model_ref m, tensor x, tiny_vit_params const& p) { x = ggml_reshape_4d(m, x, x->ne[0], 64, 64, x->ne[2]); // neck + x = cwhn_to_contiguous_2d(m, x); x = conv_2d(m["neck.0"], x); + x = contiguous_2d_to_cwhn(m, x); x = layer_norm(m["neck.1"], x); + x = cwhn_to_contiguous_2d(m, x); x = conv_2d(m["neck.2"], x, 1, 1); + x = contiguous_2d_to_cwhn(m, x); x = layer_norm(m["neck.3"], x); + return x; } @@ -418,6 +430,7 @@ auto two_way_transformer( } tensor upscale_outputs(model_ref m, tensor x) { + m.flags |= model_build_flag::cwhn; x = conv_transpose_2d(m[0], x, 2); x = layer_norm(m[1], x); x = ggml_gelu_inplace(m, x); diff --git a/src/visp/arch/mobile-sam.hpp b/src/visp/arch/mobile-sam.h similarity index 94% rename from src/visp/arch/mobile-sam.hpp rename to src/visp/arch/mobile-sam.h index f2be4b1..6e38868 100644 --- a/src/visp/arch/mobile-sam.hpp +++ b/src/visp/arch/mobile-sam.h @@ -1,8 +1,8 @@ #pragma once -#include "visp/image.hpp" -#include "visp/ml.hpp" -#include "visp/vision.hpp" +#include "visp/image.h" +#include "visp/ml.h" +#include "visp/vision.h" #include #include @@ -41,7 +41,7 @@ float resize_longest_side(i32x2 extent, int target_longest_side); tensor patch_embed(model_ref m, tensor x); tensor mb_conv(model_ref m, tensor x); -tensor patch_merging(model_ref m, tensor x, int input_resolution); +tensor patch_merging(model_ref m, tensor x); tensor mlp(model_ref m, tensor x); tensor attention_rel_bias(model_ref m, tensor x, int dim, int num_heads); tensor window_partition(model_ref m, tensor x, int window); diff --git a/src/visp/image-impl.hpp b/src/visp/image-impl.h similarity index 98% rename from src/visp/image-impl.hpp rename to src/visp/image-impl.h index 
7be76fe..6072fc8 100644 --- a/src/visp/image-impl.hpp +++ b/src/visp/image-impl.h @@ -1,8 +1,8 @@ #pragma once -#include "util/math.hpp" -#include "util/string.hpp" -#include "visp/image.hpp" +#include "util/math.h" +#include "util/string.h" +#include "visp/image.h" #include #include diff --git a/src/visp/image.cpp b/src/visp/image.cpp index 235c97d..364723e 100644 --- a/src/visp/image.cpp +++ b/src/visp/image.cpp @@ -1,7 +1,7 @@ -#include "visp/image.hpp" -#include "image-impl.hpp" -#include "util/math.hpp" -#include "util/string.hpp" +#include "visp/image.h" +#include "image-impl.h" +#include "util/math.h" +#include "util/string.h" #include #include @@ -169,6 +169,10 @@ image_data image_alloc(i32x2 extent, image_format format) { return image_data{extent, format, std::unique_ptr<uint8_t[]>(new uint8_t[size])}; } +void image_clear(image_span const& img) { + memset(img.data, 0, n_bytes(img)); +} + image_format image_format_from_channels(int n_channels) { switch (n_channels) { case 1: return image_format::alpha_u8; @@ -629,10 +633,13 @@ void tile_merge( coverage[i] = layout.overlap[i]; } } - float norm = float((coverage[0] + 1) * (coverage[1] + 1)); - float blend = weight > 0 ? weight / norm : 1.0f; - - dst.store(idx, dst.load(idx) + blend * tile.load(idx - beg)); + f32x4 val = tile.load(idx - beg); + if (weight > 0) { + float norm = float((coverage[0] + 1) * (coverage[1] + 1)); + float blend = weight / norm; + val = dst.load(idx) + blend * val; + } + dst.store(idx, val); } } } diff --git a/src/visp/ml.cpp b/src/visp/ml.cpp index f75eccb..dbc6a1c 100644 --- a/src/visp/ml.cpp +++ b/src/visp/ml.cpp @@ -1,6 +1,6 @@ -#include "visp/ml.hpp" -#include "visp/platform.hpp" -#include "util/string.hpp" +#include "visp/ml.h" +#include "util/string.h" +#include "visp/platform.h" #include #include @@ -33,7 +33,7 @@ bool load_ggml_backends() { auto str = dir.parent_path().u8string(); ggml_backend_load_all_from_path((char const*)str.c_str()); } - } + } return true; }(); return loaded; @@ -94,6 +94,13 @@ ggml_type backend_device::preferred_float_type() const { return GGML_TYPE_COUNT; // no preference, use float type of model weights } +tensor_data_layout backend_device::preferred_layout() const { + if (type() == backend_type::cpu) { + return tensor_data_layout::cwhn; + } + return tensor_data_layout::unknown; // no preference, keep model weight layout +} + size_t backend_device::total_memory() const { ggml_backend_dev_t dev = ggml_backend_get_device(handle.get()); size_t free, total; @@ -114,115 +121,293 @@ void backend_set_n_threads(backend_device& b, int n_threads) { } // -// model_weights +// model_build_flags -bool is_float_type(ggml_type t) { - return t != GGML_TYPE_I8 && t != GGML_TYPE_I16 && t != GGML_TYPE_I32 && t != GGML_TYPE_I64; +model_build_flags backend_default_flags(backend_type type) { + using enum model_build_flag; + switch (type) { + case backend_type::cpu: + return conv_2d_direct_cwhn | concat_n | f16_conv_transpose | window_partition; + case backend_type::gpu: return {}; + } + return {}; } -struct float_converter { - ggml_type target; - ggml_type_traits const* dst_traits = nullptr; - std::vector<float> f32_buffer; - std::vector<uint8_t> dst_buffer; +model_build_flags model_get_build_flags(model_file const& file) { + fixed_string<64> str; + std::string_view arch = file.arch(); + model_build_flags flags; - explicit float_converter(ggml_type target_type) : target(target_type) { - if (target != GGML_TYPE_COUNT) { - dst_traits = ggml_get_type_traits(target_type); + int64_t key = gguf_find_key(file.gguf.get(), format(str, 
"{}.tensor_data_layout", arch)); + if (key != -1) { + std::string_view layout = gguf_get_val_str(file.gguf.get(), key); + if (layout == "cwhn") { + flags |= model_build_flag::cwhn; } } + return flags; +} - ggml_type target_type(ggml_tensor const* t) const { - if (target == GGML_TYPE_COUNT || !is_float_type(t->type)) { - return t->type; - } - return target; +// +// model_file + +model_file model_load(char const* filepath) { + ggml_context* data_ctx; + gguf_init_params params; + params.no_alloc = false; + params.ctx = &data_ctx; + + gguf_context_ptr gguf_ctx(gguf_init_from_file(filepath, params)); + if (!gguf_ctx) { + throw except("Failed to load GGUF model: {}", filepath); } + return model_file{std::move(gguf_ctx), ggml_context_ptr(data_ctx), filepath}; +} - void const* operator()(ggml_tensor const* src, ggml_tensor const* dst) { - if (target == GGML_TYPE_COUNT || src->type == dst->type) { - return src->data; - } - ASSERT(dst->type == target); +int64_t model_file::n_tensors() const { + return gguf_get_n_tensors(gguf.get()); +} - float const* f32_data = reinterpret_cast<float const*>(src->data); - if (src->type != GGML_TYPE_F32) { - if (int64_t(f32_buffer.size()) < ggml_nelements(src)) { - f32_buffer.resize(ggml_nelements(src)); - } - ggml_type_traits const* src_traits = ggml_get_type_traits(src->type); - src_traits->to_float(src->data, f32_buffer.data(), ggml_nelements(src)); - f32_data = f32_buffer.data(); - } - void const* dst_data = f32_data; - if (target != GGML_TYPE_F32) { - if (dst_buffer.size() < ggml_nbytes(dst)) { - dst_buffer.resize(ggml_nbytes(dst)); - } - dst_traits->from_float_ref(f32_data, dst_buffer.data(), ggml_nelements(dst)); - dst_data = dst_buffer.data(); +int64_t model_file::key(char const* name) const { + int64_t key_id = gguf_find_key(gguf.get(), name); + if (key_id == -1) { + throw except("Can't find key '{}' in model file {}", name, path); + } + return key_id; +} + +std::string_view model_file::get_string(char const* key_name) const { + return gguf_get_val_str(gguf.get(), key(key_name)); +} + +int model_file::get_int(char const* key_name) const { + return gguf_get_val_i32(gguf.get(), key(key_name)); +} + +std::string_view model_file::arch() const { + return get_string("general.architecture"); +} + +tensor_data_layout model_file::tensor_layout() const { + fixed_string<64> str; + int64_t key = gguf_find_key(gguf.get(), format(str, "{}.tensor_data_layout", arch())); + if (key != -1) { + std::string_view layout = gguf_get_val_str(gguf.get(), key); + if (layout == "cwhn") { + return tensor_data_layout::cwhn; + } else if (layout == "whcn") { + return tensor_data_layout::whcn; } - return dst_data; } -}; + return tensor_data_layout::unknown; +} -model_weights model_init(backend_device const& be, size_t size) { +// +// model_weights + +model_weights model_init(size_t size) { ggml_init_params params{}; params.mem_size = size * ggml_tensor_overhead(); params.no_alloc = true; ggml_context_ptr ctx(ggml_init(params)); - return model_weights{std::move(ctx), be.type(), {}, {}}; + model_weights w{}; + w.context = std::move(ctx); + w.buffer_type = backend_type::cpu; + return w; } -model_weights model_load(char const* filepath, backend_device const& backend, model_load_params p) { +bool model_allocate(model_weights& m, backend_device const& b) { + ggml_backend_buffer_ptr buffer(ggml_backend_alloc_ctx_tensors(m.context.get(), b.handle.get())); + if (!buffer) { + return false; // context contains nothing to allocate + } + m.buffer_type = b.type(); + m.extra_buffers.push_back(std::move(buffer)); + 
return true; +} - ggml_context* data_ctx; - gguf_init_params params; - params.no_alloc = false; - params.ctx = &data_ctx; +namespace { - gguf_context_ptr gguf_ctx(gguf_init_from_file(filepath, params)); - if (!gguf_ctx) { - throw std::runtime_error("Failed to load GGUF model"); - } - ggml_context_ptr data_ctx_ptr(data_ctx); - int64_t n_weights = gguf_get_n_tensors(gguf_ctx.get()); - - ggml_init_params model_ctx_params{}; - model_ctx_params.mem_size = (n_weights + p.n_extra_tensors) * ggml_tensor_overhead(); - model_ctx_params.no_alloc = true; - ggml_context_ptr model_ctx(ggml_init(model_ctx_params)); - - float_converter convert(p.float_type); - for (int64_t i = 0; i < gguf_get_n_tensors(gguf_ctx.get()); ++i) { - auto name = gguf_get_tensor_name(gguf_ctx.get(), i); - tensor orig = ggml_get_tensor(data_ctx, name); - tensor dup = ggml_new_tensor( - model_ctx.get(), convert.target_type(orig), GGML_MAX_DIMS, orig->ne); - ggml_set_name(dup, name); +bool is_float_type(ggml_type t) { + return t != GGML_TYPE_I8 && t != GGML_TYPE_I16 && t != GGML_TYPE_I32 && t != GGML_TYPE_I64; +} + +int64_t max_tensor_elements(ggml_context* ctx) { + int64_t result = 0; + for (ggml_tensor* t = ggml_get_first_tensor(ctx); t; t = ggml_get_next_tensor(ctx, t)) { + result = std::max(result, ggml_nelements(t)); } + return result; +} - ggml_backend_buffer_ptr buffer(ggml_backend_alloc_ctx_tensors(model_ctx.get(), backend)); +ggml_type detect_float_type(ggml_context* ctx) { + for (ggml_tensor* t = ggml_get_first_tensor(ctx); t; t = ggml_get_next_tensor(ctx, t)) { + if (is_float_type(t->type)) { + return t->type; + } + } + return GGML_TYPE_F32; +} - for (ggml_tensor* t = ggml_get_first_tensor(model_ctx.get()); t != nullptr; - t = ggml_get_next_tensor(model_ctx.get(), t)) { - tensor data_tensor = ggml_get_tensor(data_ctx, ggml_get_name(t)); - void const* data = convert(data_tensor, t); - ggml_backend_tensor_set(t, data, 0, ggml_nbytes(t)); +template <typename T> +void permute_whcn_to_cwhn(T* n, bool depthwise) { + if (depthwise) { // wh1c -> c1wh + T perm[] = {n[3], n[2], n[0], n[1]}; + std::copy(perm, perm + 4, n); + } else { + std::swap(n[0], n[2]); // -> chwn + std::swap(n[1], n[2]); // -> cwhn } - return model_weights{std::move(model_ctx), backend.type(), std::move(buffer), {}}; } -bool model_allocate(model_weights& m, backend_device const& b) { - ASSERT(m.buffer_type == b.type(), "Model weights must all be on the same backend"); +struct tensor_converter { + ggml_type src_type; + ggml_type dst_type; + ggml_backend_ptr backend; + ggml_context_ptr ctx; + ggml_cgraph* graph; + ggml_gallocr_ptr gallocr; + ggml_tensor convert_src{}; + ggml_tensor* convert_dst; - ggml_backend_buffer_ptr buffer(ggml_backend_alloc_ctx_tensors(m.context.get(), b.handle.get())); - if (!buffer) { - return false; // context contains nothing to allocate + tensor_converter(ggml_context* weights, ggml_type target_type, bool whcn_to_cwhn) + : dst_type(target_type) { + + if (dst_type == GGML_TYPE_COUNT && !whcn_to_cwhn) { + return; + } + src_type = detect_float_type(weights); + if (src_type == dst_type && !whcn_to_cwhn) { + return; + } + if (dst_type == GGML_TYPE_COUNT) { + dst_type = src_type; + } + + ggml_init_params ctx_params{ + .mem_size = ggml_tensor_overhead() + ggml_graph_overhead(), + .mem_buffer = nullptr, + .no_alloc = true}; + ctx.reset(ggml_init(ctx_params)); + + size_t max_elem = max_tensor_elements(weights); + graph = ggml_new_graph_custom(ctx.get(), 2, false); + convert_src.type = src_type; + convert_src.ne[0] = max_elem; + convert_src.nb[0] = 
ggml_type_size(src_type); + for (int i = 1; i < GGML_MAX_DIMS; ++i) { + convert_src.ne[i] = 1; + convert_src.nb[i] = convert_src.nb[i - 1] * convert_src.ne[i - 1]; + } + convert_dst = ggml_cast(ctx.get(), &convert_src, dst_type); + ggml_set_output(convert_dst); + ggml_build_forward_expand(graph, convert_dst); + + gallocr.reset(ggml_gallocr_new(ggml_backend_cpu_buffer_type())); + ggml_gallocr_reserve(gallocr.get(), graph); + + backend.reset(ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr)); + } + + ggml_type target_type(ggml_tensor const* t) const { + if (dst_type == GGML_TYPE_COUNT || !is_float_type(t->type)) { + return t->type; + } + return dst_type; + } + + void const* operator()(ggml_tensor const* src, ggml_tensor const* dst, bool whcn_to_cwhn) { + bool need_type_conv = is_float_type(src->type) && src->type != dst_type; + if (dst_type == GGML_TYPE_COUNT || !(need_type_conv || whcn_to_cwhn)) { + return src->data; + } + ASSERT(ctx, "Weights contain tensors that would require conversion"); + + convert_src.type = src->type; + convert_src.data = src->data; + std::copy(src->ne, src->ne + GGML_MAX_DIMS, convert_src.ne); + std::copy(src->nb, src->nb + GGML_MAX_DIMS, convert_src.nb); + if (whcn_to_cwhn) { + bool depthwise = convert_src.ne[2] == 1; + permute_whcn_to_cwhn(convert_src.ne, depthwise); + permute_whcn_to_cwhn(convert_src.nb, depthwise); + } + + ASSERT(convert_dst->type == dst->type); + std::copy(dst->ne, dst->ne + GGML_MAX_DIMS, convert_dst->ne); + std::copy(dst->nb, dst->nb + GGML_MAX_DIMS, convert_dst->nb); + + bool alloc_ok = ggml_gallocr_alloc_graph(gallocr.get(), graph); + ASSERT(alloc_ok); + + ggml_backend_graph_compute(backend.get(), graph); + return convert_dst->data; + } +}; + +span<int32_t const> find_conv2d_weight_indices(model_file const& f) { + gguf_context* gguf = f.gguf.get(); + auto name = format<fixed_string<64>>("{}.conv2d_weights", f.arch()); + int64_t key = gguf_find_key(gguf, name.c_str()); + if (key != -1 && gguf_get_arr_type(gguf, key) == GGUF_TYPE_INT32) { + size_t n = gguf_get_arr_n(gguf, key); + int32_t const* a = reinterpret_cast<int32_t const*>(gguf_get_arr_data(gguf, key)); + return span(a, n); + } + return {}; +} + +} // namespace + +void model_transfer( + model_file const& file, + model_weights& weights, + backend_device const& device, + ggml_type float_type, + tensor_data_layout layout) { + + gguf_context* gguf = file.gguf.get(); + ggml_context* src_ctx = file.data.get(); + ggml_context* dst_ctx = weights.context.get(); + + tensor_data_layout file_layout = file.tensor_layout(); + bool to_cwhn = file_layout == tensor_data_layout::whcn && layout == tensor_data_layout::cwhn; + tensor_converter convert(src_ctx, float_type, to_cwhn); + // Try to find a list of tensor indices which are weights of 2D operations + span<int32_t const> conv2d_weights = find_conv2d_weight_indices(file); + + for (int64_t i = 0, conv2d_idx = 0; i < gguf_get_n_tensors(gguf); ++i) { + auto name = gguf_get_tensor_name(gguf, i); + tensor orig = ggml_get_tensor(src_ctx, name); // TODO: don't use name lookup + auto ne = nelements(orig); + if (to_cwhn && conv2d_idx < ssize(conv2d_weights) && conv2d_weights[conv2d_idx] == i) { + permute_whcn_to_cwhn(ne.data(), ne[2] == 1); + ++conv2d_idx; + } + tensor dup = ggml_new_tensor(dst_ctx, convert.target_type(orig), GGML_MAX_DIMS, ne.data()); + ggml_set_name(dup, name); + } + + ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(dst_ctx, device); + weights.weights_buffer = ggml_backend_buffer_ptr(buffer); + weights.buffer_type = device.type(); + weights.flags = 
model_get_build_flags(file); + if (to_cwhn) { + weights.flags |= model_build_flag::cwhn; + } + + ggml_tensor* t = ggml_get_first_tensor(dst_ctx); + for (int i = 0, conv2d_idx = 0; t; ++i) { + tensor data_tensor = ggml_get_tensor(src_ctx, ggml_get_name(t)); + bool is_2d = conv2d_idx < int(conv2d_weights.size()) && conv2d_weights[conv2d_idx] == i; + if (is_2d) { + ++conv2d_idx; + } + void const* data = convert(data_tensor, t, is_2d && to_cwhn); + ggml_backend_tensor_set(t, data, 0, ggml_nbytes(t)); + t = ggml_get_next_tensor(dst_ctx, t); } - m.extra_buffers.push_back(std::move(buffer)); - return true; } ggml_type model_weights::float_type() const { @@ -266,27 +451,17 @@ void compute(compute_graph const& g, backend_device const& b) { // // model_ref -model_build_flags default_backend_flags(backend_type type) { - using enum model_build_flag; - switch (type) { - case backend_type::cpu: - return cwhn | conv_2d_direct | fused_batch_norm | f16_conv_transpose | window_partition; - case backend_type::gpu: return cwhn; - } - return {}; -} - model_ref::model_ref(model_weights& m) : weights_context(m.context.get()), graph_context(m.context.get()), graph(nullptr), - flags(default_backend_flags(m.buffer_type)) {} + flags(m.flags | backend_default_flags(m.buffer_type)) {} model_ref::model_ref(model_weights& m, compute_graph& g) : weights_context(m.context.get()), graph_context(g.context.get()), graph(g.graph), - flags(default_backend_flags(m.buffer_type)) {} + flags(m.flags | backend_default_flags(m.buffer_type)) {} model_ref::model_ref( ggml_context* weights_context, @@ -473,8 +648,11 @@ tensor concat(model_ref const& m, std::array src, int dim) } tensor interpolate(model_ref const& m, tensor x, i64x2 target, int32_t mode) { - return ggml_interpolate( - m, x, int(target[0]), int(target[1]), int(x->ne[2]), int(x->ne[3]), mode); + if ((m.flags & model_build_flag::cwhn) && mode == GGML_SCALE_MODE_NEAREST) { + return ggml_interpolate(m, x, x->ne[0], target[0], target[1], x->ne[3], mode); + } + // Bilinear interpolation requires WHCN layout! + return ggml_interpolate(m, x, target[0], target[1], x->ne[2], x->ne[3], mode); } } // namespace visp diff --git a/src/visp/nn.cpp b/src/visp/nn.cpp index 7dc84a2..7b6065b 100644 --- a/src/visp/nn.cpp +++ b/src/visp/nn.cpp @@ -1,18 +1,14 @@ -#include "nn.hpp" -#include "util/string.hpp" +#include "nn.h" +#include "util/string.h" namespace visp { -tensor add_bias(model_ref m, tensor x) { - if (tensor bias = m.find("bias")) { - x = ggml_add_inplace(m, x, bias); - } - return x; -} tensor linear(model_ref m, tensor x) { x = ggml_mul_mat(m, m.weights("weight"), x); - x = add_bias(m, x); + if (tensor bias = m.find("bias")) { + x = ggml_add_inplace(m, x, bias); + } return x; } @@ -31,87 +27,150 @@ tensor permute_whcn_to_cwhn(model_ref m, tensor x) { return ggml_permute(m, x, 1, 2, 0, 3); } -tensor conv_2d(model_ref m, tensor x, int stride, int pad) { - ASSERT(m.flags & model_build_flag::cwhn); +std::array<int64_t, 4> nelements_whcn(model_ref const& m, tensor t) { + auto ne = nelements(t); + return (m.flags & model_build_flag::cwhn) ? 
std::array{ne[1], ne[2], ne[0], ne[3]} : ne; +} + +tensor cwhn_to_contiguous_2d(model_ref m, tensor x) { + if (m.flags & model_build_flag::cwhn) { + return x; // preferred 2D layout is CWHN too + } + return ggml_cont(m, permute_cwhn_to_whcn(m, x)); +} + +tensor whcn_to_contiguous_2d(model_ref m, tensor x) { + if (m.flags & model_build_flag::cwhn) { + return ggml_cont(m, permute_whcn_to_cwhn(m, x)); + } + return x; +} + +tensor contiguous_2d_to_cwhn(model_ref m, tensor x) { + if (m.flags & model_build_flag::cwhn) { + return x; // x is already CWHN + } + return ggml_cont(m, permute_whcn_to_cwhn(m, x)); +} + +tensor contiguous_2d_to_whcn(model_ref m, tensor x) { + if (m.flags & model_build_flag::cwhn) { + return ggml_cont(m, permute_cwhn_to_whcn(m, x)); + } + return x; +} + +tensor add_bias_2d(model_ref m, tensor x) { + if (tensor bias = m.find("bias")) { + if (!(m.flags & model_build_flag::cwhn)) { + bias = ggml_reshape_4d(m, bias, 1, 1, bias->ne[0], 1); + } + x = ggml_add_inplace(m, x, bias); + } + return x; +} +tensor conv_2d(model_ref m, tensor x, int stride, int pad) { tensor weight = m.weights("weight"); - if (weight->ne[1] == 1 && weight->ne[2] == 1 && stride == 1) { - auto [c, w, h, b] = nelements(x); - weight = ggml_reshape_2d(m, weight, weight->ne[0], weight->ne[3]); - x = ggml_reshape_2d(m, x, x->ne[0], w * h * b); - x = ggml_mul_mat(m, weight, x); - x = ggml_reshape_4d(m, x, weight->ne[1], w, h, b); - - } else if (m.flags & model_build_flag::conv_2d_direct) { - weight = permute_cwhn_to_whcn(m, weight); - x = permute_cwhn_to_whcn(m, x); - x = ggml_conv_2d(m, weight, x, stride, stride, pad, pad, 1, 1); - x = permute_whcn_to_cwhn(m, x); - } else { - x = permute_cwhn_to_whcn(m, x); - tensor permuted_weight = permute_cwhn_to_whcn(m, weight); - tensor cols = ggml_im2col( - m, permuted_weight, x, stride, stride, pad, pad, 1, 1, true, GGML_TYPE_F32); - tensor a = ggml_reshape_2d(m, cols, cols->ne[0], cols->ne[1] * cols->ne[2] * cols->ne[3]); - tensor b = ggml_reshape_2d( - m, weight, weight->ne[0] * weight->ne[1] * weight->ne[2], weight->ne[3]); - x = ggml_mul_mat(m, b, a); - x = ggml_reshape_4d(m, x, weight->ne[3], cols->ne[1], cols->ne[2], cols->ne[3]); + if (m.flags & model_build_flag::cwhn) { + if (weight->ne[1] == 1 && weight->ne[2] == 1 && stride == 1) { + auto [c, w, h, b] = nelements(x); + weight = ggml_reshape_2d(m, weight, weight->ne[0], weight->ne[3]); + x = ggml_reshape_2d(m, x, x->ne[0], w * h * b); + x = ggml_mul_mat(m, weight, x); + x = ggml_reshape_4d(m, x, weight->ne[1], w, h, b); + + } else if (m.flags & model_build_flag::conv_2d_direct_cwhn) { + weight = permute_cwhn_to_whcn(m, weight); + x = permute_cwhn_to_whcn(m, x); + x = ggml_conv_2d_direct(m, weight, x, stride, stride, pad, pad, 1, 1); + x = permute_whcn_to_cwhn(m, x); + + } else { + x = permute_cwhn_to_whcn(m, x); + tensor permuted_weight = permute_cwhn_to_whcn(m, weight); + tensor cols = ggml_im2col( + m, permuted_weight, x, stride, stride, pad, pad, 1, 1, true, GGML_TYPE_F32); + tensor a = ggml_reshape_2d( + m, cols, cols->ne[0], cols->ne[1] * cols->ne[2] * cols->ne[3]); + tensor b = ggml_reshape_2d( + m, weight, weight->ne[0] * weight->ne[1] * weight->ne[2], weight->ne[3]); + x = ggml_mul_mat(m, b, a); + x = ggml_reshape_4d(m, x, weight->ne[3], cols->ne[1], cols->ne[2], cols->ne[3]); + } + } else { // WHCN layout + x = ggml_conv_2d_direct(m, weight, x, stride, stride, pad, pad, 1, 1); } - x = add_bias(m, x); + x = add_bias_2d(m, x); return x; } tensor conv_2d_depthwise(model_ref m, tensor x, int stride, int 
pad) { - ASSERT(m.flags & model_build_flag::cwhn); - - tensor weight = ggml_permute(m, m.weights("weight"), 3, 2, 0, 1); - x = permute_cwhn_to_whcn(m, x); - x = ggml_conv_2d_dw_direct(m, weight, x, stride, stride, pad, pad, 1, 1); - x = permute_whcn_to_cwhn(m, x); + tensor weight = m.weights("weight"); - x = add_bias(m, x); + if (m.flags & model_build_flag::cwhn) { + weight = ggml_permute(m, weight, 3, 2, 0, 1); + x = permute_cwhn_to_whcn(m, x); + x = ggml_conv_2d_dw_direct(m, weight, x, stride, stride, pad, pad, 1, 1); + x = permute_whcn_to_cwhn(m, x); + } else { + x = ggml_conv_2d_dw_direct(m, weight, x, stride, stride, pad, pad, 1, 1); + } + x = add_bias_2d(m, x); return x; } tensor conv_transpose_2d(model_ref m, tensor x, int stride) { - ASSERT(m.flags & model_build_flag::cwhn); - tensor weight = m.weights("weight"); if (m.flags & model_build_flag::f16_conv_transpose) { // TODO: ggml_conv_transpose_2d_p0 expects fp16 weights (cpu backend) weight = ggml_cast(m, weight, GGML_TYPE_F16); } - x = ggml_cont(m, permute_cwhn_to_whcn(m, x)); + if (m.flags & model_build_flag::cwhn) { + x = ggml_cont(m, permute_cwhn_to_whcn(m, x)); + } x = ggml_conv_transpose_2d_p0(m, weight, x, stride); - x = ggml_cont(m, permute_whcn_to_cwhn(m, x)); - x = add_bias(m, x); + + if (m.flags & model_build_flag::cwhn) { + x = ggml_cont(m, permute_whcn_to_cwhn(m, x)); + } + x = add_bias_2d(m, x); return x; } tensor conv_2d_deform( model_ref m, tensor x, tensor weight, tensor offset, tensor mask, int stride, int pad) { - ASSERT(m.flags & model_build_flag::cwhn); - x = permute_cwhn_to_whcn(m, x); - weight = permute_cwhn_to_whcn(m, weight); - offset = permute_cwhn_to_whcn(m, offset); - if (mask) { - mask = permute_cwhn_to_whcn(m, mask); + if (m.flags & model_build_flag::cwhn) { + x = permute_cwhn_to_whcn(m, x); + weight = permute_cwhn_to_whcn(m, weight); + offset = permute_cwhn_to_whcn(m, offset); + if (mask) { + mask = permute_cwhn_to_whcn(m, mask); + } } x = ggml_conv_2d_deform(m, weight, x, offset, mask, stride, stride, pad, pad); - x = permute_whcn_to_cwhn(m, x); + + if (m.flags & model_build_flag::cwhn) { + x = permute_whcn_to_cwhn(m, x); + } return x; } tensor batch_norm_2d(model_ref m, tensor x) { - ASSERT(m.flags & model_build_flag::cwhn); + // Batch norm is expected to have been fused into mul+add. See convert.py ASSERT(m.find("running_mean") == nullptr, "Batch norm was not fused"); ASSERT(m.find("running_var") == nullptr, "Batch norm was not fused"); - x = ggml_mul_inplace(m, x, m.weights("weight")); - x = ggml_add_inplace(m, x, m.weights("bias")); + tensor weight = m.weights("weight"); + tensor bias = m.weights("bias"); + if (!(m.flags & model_build_flag::cwhn)) { // WHCN layout + weight = ggml_reshape_4d(m, weight, 1, 1, weight->ne[0], 1); + bias = ggml_reshape_4d(m, bias, 1, 1, bias->ne[0], 1); + } + x = ggml_mul_inplace(m, x, weight); + x = ggml_add_inplace(m, x, bias); return named(m, x); } diff --git a/src/visp/nn.h b/src/visp/nn.h new file mode 100644 index 0000000..eb8c106 --- /dev/null +++ b/src/visp/nn.h @@ -0,0 +1,41 @@ +#pragma once + +#include "visp/ml.h" +#include "visp/util.h" + +// Common neural network building blocks + +namespace visp { + +tensor linear(model_ref, tensor x); +tensor layer_norm(model_ref, tensor x, float eps = 1e-5f); + +// Permute between CWHN and WHCN tensor dimension ordering. Does not rewrite tensor data. 
+tensor permute_cwhn_to_whcn(model_ref m, tensor x); +tensor permute_whcn_to_cwhn(model_ref m, tensor x); + +// "Contiguous 2D" refers to the layout configured in `m` model flags, ie. the preferred +// memory layout for 2D operations like convolution. +inline bool is_whcn(model_ref m) { return !(m.flags & model_build_flag::cwhn); } +inline bool is_cwhn(model_ref m) { return !!(m.flags & model_build_flag::cwhn); } + +// These functions convert between memory layouts, ie. they rewrite tensor data. +tensor cwhn_to_contiguous_2d(model_ref m, tensor x); +tensor whcn_to_contiguous_2d(model_ref m, tensor x); +tensor contiguous_2d_to_cwhn(model_ref m, tensor x); +tensor contiguous_2d_to_whcn(model_ref m, tensor x); + +// Always returns number of elements of tensor in width-height-channels-batch order, +// even if that's not how they're stored in memory. +std::array<int64_t, 4> nelements_whcn(model_ref const&, tensor t); + +// 2D (convolution) functions +// Input and weight are expected to be in "contiguous 2D" layout as configured in `m`. +tensor conv_2d(model_ref m, tensor x, int stride = 1, int pad = 0); +tensor conv_2d_depthwise(model_ref m, tensor x, int stride = 1, int pad = 0); +tensor conv_2d_deform( + model_ref m, tensor x, tensor weight, tensor offset, tensor mask, int stride, int pad); +tensor conv_transpose_2d(model_ref m, tensor x, int stride); +tensor batch_norm_2d(model_ref, tensor x); + +} // namespace visp diff --git a/src/visp/nn.hpp b/src/visp/nn.hpp deleted file mode 100644 index 418c45c..0000000 --- a/src/visp/nn.hpp +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include "visp/ml.hpp" -#include "visp/util.hpp" - -// Common neural network building blocks - -namespace visp { - -tensor linear(model_ref, tensor x); -tensor layer_norm(model_ref, tensor x, float eps = 1e-5f); - -// Permute between CWHN and WHCN tensor dimension ordering. Does not rewrite tensor data. 
-tensor permute_cwhn_to_whcn(model_ref m, tensor x); -tensor permute_whcn_to_cwhn(model_ref m, tensor x); - -tensor conv_2d(model_ref, tensor x, int stride = 1, int pad = 0); -tensor conv_2d_depthwise(model_ref, tensor x, int stride = 1, int pad = 0); -tensor conv_2d_deform( - model_ref, tensor x, tensor weight, tensor offset, tensor mask, int stride, int pad); -tensor conv_transpose_2d(model_ref m, tensor x, int stride); - -tensor batch_norm_2d(model_ref, tensor x); - -} // namespace visp diff --git a/src/visp/platform.cpp b/src/visp/platform.cpp index a398483..eeaa28b 100644 --- a/src/visp/platform.cpp +++ b/src/visp/platform.cpp @@ -1,4 +1,4 @@ -#include "visp/platform.hpp" +#include "visp/platform.h" #ifdef _WIN32 # ifndef WIN32_LEAN_AND_MEAN diff --git a/src/visp/platform.hpp b/src/visp/platform.h similarity index 100% rename from src/visp/platform.hpp rename to src/visp/platform.h diff --git a/src/visp/vision.cpp b/src/visp/vision.cpp index 4d335ef..a743311 100644 --- a/src/visp/vision.cpp +++ b/src/visp/vision.cpp @@ -1,38 +1,38 @@ -#include "visp/vision.hpp" -#include "util/math.hpp" -#include "util/string.hpp" +#include "visp/vision.h" +#include "util/math.h" +#include "util/string.h" namespace visp { // // Mobile SAM -sam_model sam_load_model(char const* filepath, backend_device const& backend) { +sam_model sam_load_model(char const* filepath, backend_device const& dev) { sam_model model; - model.backend = &backend; - model_load_params load_params = { - .float_type = backend.preferred_float_type(), - .n_extra_tensors = 0, - }; - model.weights = model_load(filepath, backend, load_params); + model.backend = &dev; + model_file file = model_load(filepath); model.params = sam_params{}; - model.encoder = compute_graph_init(); - - model_ref m = model_ref(model.weights, model.encoder); - int res = model.params.image_size; - model.input_image = compute_graph_input(m, GGML_TYPE_F32, {3, res, res, 1}); - tensor embeds = sam_encode_image(m, model.input_image, model.params); - model.output_embed = compute_graph_output(m, embeds); - - compute_graph_allocate(model.encoder, backend); + model.weights = model_init(file.n_tensors()); + model_transfer(file, model.weights, dev, dev.preferred_float_type(), dev.preferred_layout()); return model; } -void sam_encode(sam_model& m, image_view image) { - m.image_extent = image.extent; - image_data img_data = sam_process_input(image, m.params); - transfer_to_backend(m.input_image, img_data); - compute(m.encoder, *m.backend); +void sam_encode(sam_model& model, image_view image) { + if (!model.encoder) { + model.encoder = compute_graph_init(); + model_ref m = model_ref(model.weights, model.encoder); + + int res = model.params.image_size; + model.input_image = compute_graph_input(m, GGML_TYPE_F32, {3, res, res, 1}); + tensor embeds = sam_encode_image(m, model.input_image, model.params); + model.output_embed = compute_graph_output(m, embeds); + compute_graph_allocate(model.encoder, *model.backend); + } + + model.image_extent = image.extent; + image_data img_data = sam_process_input(image, model.params); + transfer_to_backend(model.input_image, img_data); + compute(model.encoder, *model.backend); } image_data sam_compute_impl(sam_model& model, i32x2 point1, i32x2 point2) { @@ -52,6 +52,7 @@ image_data sam_compute_impl(sam_model& model, i32x2 point1, i32x2 point2) { compute_graph_allocate(model.decoder, *model.backend); } + f32x4 prompt_data = is_point ? 
sam_process_point(point1, model.image_extent, model.params) : sam_process_box({point1, point2}, model.image_extent, model.params); @@ -78,33 +79,33 @@ image_data sam_compute(sam_model& model, box_2d box) { // // BiRefNet -birefnet_model birefnet_load_model(char const* filepath, backend_device const& backend) { +birefnet_model birefnet_load_model(char const* filepath, backend_device const& dev) { birefnet_model model; - model.backend = &backend; - model_load_params load_params = { - .float_type = backend.preferred_float_type(), - .n_extra_tensors = swin_params::n_layers + 2 - }; - model.weights = model_load(filepath, backend, load_params); - model.params = birefnet_detect_params(model.weights); - - birefnet_buffers buffers = birefnet_precompute(model.weights, model.params); - model_allocate(model.weights, backend); - for (tensor_data const& buf : buffers) { - transfer_to_backend(buf); - } - - model.graph = compute_graph_init(6 * 1024); - model_ref m(model.weights, model.graph); - int res = model.params.image_size; - model.input = compute_graph_input(m, GGML_TYPE_F32, {3, res, res, 1}); - model.output = birefnet_predict(m, model.input, model.params); - compute_graph_allocate(model.graph, backend); - + model.backend = &dev; + model_file file = model_load(filepath); + model.params = birefnet_detect_params(file, {1024, 1024}); + model.weights = model_init(file.n_tensors()); + model_transfer(file, model.weights, dev, dev.preferred_float_type(), dev.preferred_layout()); return model; } image_data birefnet_compute(birefnet_model& model, image_view image) { + i32x2 res = birefnet_image_extent(image.extent, model.params); + if (!model.input || res != model.params.image_extent) { + model.params.image_extent = res; + model.graph = compute_graph_init(6 * 1024); + + model_ref m(model.weights, model.graph); + birefnet_buffers buffers = birefnet_precompute(m, model.params); + model.input = compute_graph_input(m, GGML_TYPE_F32, {3, res[0], res[1], 1}); + model.output = birefnet_predict(m, model.input, model.params); + + compute_graph_allocate(model.graph, *model.backend); + for (tensor_data const& buf : buffers) { + transfer_to_backend(buf); + } + } + image_data img_data = birefnet_process_input(image, model.params); transfer_to_backend(model.input, img_data); @@ -117,28 +118,28 @@ image_data birefnet_compute(birefnet_model& model, image_view image) { // // MI-GAN -migan_model migan_load_model(char const* filepath, backend_device const& backend) { +migan_model migan_load_model(char const* filepath, backend_device const& dev) { migan_model model; - model.backend = &backend; - model_load_params load_params = { - .float_type = backend.preferred_float_type(), - .n_extra_tensors = 0 - }; - model.weights = model_load(filepath, backend, load_params); - model.params = migan_detect_params(model.weights); + model.backend = &dev; + model_file file = model_load(filepath); + model.params = migan_detect_params(file); model.params.invert_mask = true; // inpaint opaque areas - int res = model.params.resolution; - - model.graph = compute_graph_init(); - model_ref m(model.weights, model.graph); - model.input = compute_graph_input(m, GGML_TYPE_F32, {4, res, res, 1}); - model.output = migan_generate(m, model.input, model.params); - compute_graph_allocate(model.graph, backend); - + model.weights = model_init(file.n_tensors()); + model_transfer(file, model.weights, dev, dev.preferred_float_type(), dev.preferred_layout()); return model; } image_data migan_compute(migan_model& model, image_view image, image_view mask) { + if 
(!model.graph) { + model.graph = compute_graph_init(); + model_ref m(model.weights, model.graph); + + int res = model.params.resolution; + model.input = compute_graph_input(m, GGML_TYPE_F32, {4, res, res, 1}); + model.output = migan_generate(m, model.input, model.params); + compute_graph_allocate(model.graph, *model.backend); + } + image_data input_data = migan_process_input(image, mask, model.params); transfer_to_backend(model.input, input_data); @@ -155,15 +156,13 @@ image_data migan_compute(migan_model& model, image_view image, image_view mask) constexpr int esrgan_default_tile_size = 224; -esrgan_model esrgan_load_model(char const* filepath, backend_device const& b) { +esrgan_model esrgan_load_model(char const* filepath, backend_device const& dev) { esrgan_model model; - model.backend = &b; - model_load_params load_params = { - .float_type = b.preferred_float_type(), - .n_extra_tensors = 0 - }; - model.weights = model_load(filepath, b, load_params); - model.params = esrgan_detect_params(model.weights); + model.backend = &dev; + model_file file = model_load(filepath); + model.params = esrgan_detect_params(file); + model.weights = model_init(file.n_tensors()); + model_transfer(file, model.weights, dev, dev.preferred_float_type(), dev.preferred_layout()); return model; } @@ -185,6 +184,7 @@ image_data esrgan_compute(esrgan_model& model, image_view image) { image_data input_tile = image_alloc(tiles.tile_size, image_format::rgb_f32); image_data output_tile = image_alloc(tiles_out.tile_size, image_format::rgb_f32); image_data output_image = image_alloc(image.extent * model.params.scale, image_format::rgb_f32); + image_clear(output_image); for (int t = 0; t < tiles.total(); ++t) { i32x2 tile_coord = tiles.coord(t); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 76aaade..6d06f46 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -6,8 +6,9 @@ target_sources(test-vision PRIVATE test-image.cpp ) target_include_directories(test-vision PRIVATE . ../src) -target_compile_options(test-vision PRIVATE ${VISP_WARNINGS}) -target_compile_definitions(test-vision PRIVATE ${VISP_ASSERT} ${VISP_FMT_DEFS}) +target_compile_definitions(test-vision PRIVATE ${VISP_ASSERT} ${VISP_DEFINITIONS}) +target_compile_options(test-vision PRIVATE ${VISP_WARNINGS} ${VISP_COMP_OPTIONS}) +target_link_options(test-vision PRIVATE ${VISP_LINK_OPTIONS}) target_link_libraries(test-vision PRIVATE visioncpp ${VISP_FMT_LINK}) add_test(NAME vision COMMAND test-vision -v) @@ -20,8 +21,9 @@ target_sources(test-models PRIVATE test-models.cpp ) target_include_directories(test-models PRIVATE . 
../src) -target_compile_options(test-models PRIVATE ${VISP_WARNINGS}) -target_compile_definitions(test-models PRIVATE ${VISP_ASSERT} ${VISP_FMT_DEFS}) +target_compile_definitions(test-models PRIVATE ${VISP_ASSERT} ${VISP_DEFINITIONS}) +target_compile_options(test-models PRIVATE ${VISP_WARNINGS} ${VISP_COMP_OPTIONS}) +target_link_options(test-models PRIVATE ${VISP_LINK_OPTIONS}) target_link_libraries(test-models PRIVATE visioncpp ${VISP_FMT_LINK}) if(VISP_VULKAN AND NOT VISP_CI) add_test(NAME models COMMAND test-models -v) @@ -37,7 +39,9 @@ include(reference-images.cmake) add_library(vision-workbench workbench.cpp) target_include_directories(vision-workbench PRIVATE ../src) -target_compile_definitions(vision-workbench PRIVATE ${VISP_ASSERT} ${VISP_FMT_DEFS}) +target_compile_definitions(vision-workbench PRIVATE ${VISP_ASSERT} ${VISP_DEFINITIONS}) +target_compile_options(vision-workbench PRIVATE ${VISP_COMP_OPTIONS}) +target_link_options(vision-workbench PRIVATE ${VISP_LINK_OPTIONS}) target_link_libraries(vision-workbench PRIVATE visioncpp ggml ${VISP_FMT_LINK}) # @@ -46,6 +50,7 @@ target_link_libraries(vision-workbench PRIVATE visioncpp ggml ${VISP_FMT_LINK}) add_executable(vision-bench) target_sources(vision-bench PRIVATE benchmark.cpp testing.cpp) target_include_directories(vision-bench PRIVATE . ../src) -target_compile_definitions(vision-bench PRIVATE VISP_TEST_NO_MAIN ${VISP_ASSERT} ${VISP_FMT_DEFS}) -target_compile_options(vision-bench PRIVATE ${VISP_WARNINGS}) +target_compile_definitions(vision-bench PRIVATE VISP_TEST_NO_MAIN ${VISP_ASSERT} ${VISP_DEFINITIONS}) +target_compile_options(vision-bench PRIVATE ${VISP_WARNINGS} ${VISP_COMP_OPTIONS}) +target_link_options(vision-bench PRIVATE ${VISP_LINK_OPTIONS}) target_link_libraries(vision-bench PRIVATE visioncpp ${VISP_FMT_LINK}) diff --git a/tests/benchmark.cpp b/tests/benchmark.cpp index 5474a39..a75bd13 100644 --- a/tests/benchmark.cpp +++ b/tests/benchmark.cpp @@ -1,8 +1,8 @@ -#include "testing.hpp" -#include "visp/image.hpp" -#include "visp/ml.hpp" -#include "visp/util.hpp" -#include "visp/vision.hpp" +#include "testing.h" +#include "visp/image.h" +#include "visp/ml.h" +#include "visp/util.h" +#include "visp/vision.h" #include #include @@ -114,7 +114,7 @@ bench_timings benchmark_esrgan(path model_path, backend_device& backend) { image_data input_data = image_u8_to_f32(input, image_format::rgb_f32); compute_graph graph = compute_graph_init(esrgan_estimate_graph_size(model.params)); - model_ref m(model.weights, model.graph); + model_ref m(model.weights, graph); i64x4 input_shape = {3, input.extent[0], input.extent[1], 1}; model.input = compute_graph_input(m, GGML_TYPE_F32, input_shape); model.output = esrgan_generate(m, model.input, model.params); @@ -259,7 +259,7 @@ int main(int argc, char** argv) { line, "| {: <10} | {: <30} | {: <6} | {: >11} | {: >6} |\n", "Arch", "Model", "Device", "Avg", "Dev")); printf("|:-----------|:-------------------------------|:-------|------------:|-------:|\n"); for (const auto& result : results) { - auto model = result.model.substr(std::max(int(result.model.length()) - 32, 0)); + auto model = result.model.substr(std::max(int(result.model.length()) - 30, 0)); print(format( line, "| {: <10} | {: <30} | {: <6} | {:8.1f} ms | {:6.1f} |\n", result.arch, model, result.backend, result.time.mean, result.time.stdev)); diff --git a/tests/reference-images.cmake b/tests/reference-images.cmake index ff9b6c2..d2d0a0b 100644 --- a/tests/reference-images.cmake +++ b/tests/reference-images.cmake @@ -1,4 +1,5 @@ 
file(DOWNLOAD "https://lfs.interstice.cloud/vision.cpp/tests/reference/birefnet-cpu.png/c8663d4c985f94b29fcca3c3c5d2058c53447f19c521b7c5f97276cace68bb09" "tests/reference/birefnet-cpu.png" EXPECTED_HASH SHA256=c8663d4c985f94b29fcca3c3c5d2058c53447f19c521b7c5f97276cace68bb09) +file(DOWNLOAD "https://lfs.interstice.cloud/vision.cpp/tests/reference/birefnet-dynamic.png/720bf20140f6f93c3c3953ed2e28a9cb395def8426f53c031d58a8393784227f" "tests/reference/birefnet-dynamic.png" EXPECTED_HASH SHA256=720bf20140f6f93c3c3953ed2e28a9cb395def8426f53c031d58a8393784227f) file(DOWNLOAD "https://lfs.interstice.cloud/vision.cpp/tests/reference/birefnet-gpu.png/c8663d4c985f94b29fcca3c3c5d2058c53447f19c521b7c5f97276cace68bb09" "tests/reference/birefnet-gpu.png" EXPECTED_HASH SHA256=c8663d4c985f94b29fcca3c3c5d2058c53447f19c521b7c5f97276cace68bb09) file(DOWNLOAD "https://lfs.interstice.cloud/vision.cpp/tests/reference/esrgan-cpu.png/481dcc0eb617feb9f8f7403ce179e77e2eba2c7a067f4a1ea90e0fb47083d814" "tests/reference/esrgan-cpu.png" EXPECTED_HASH SHA256=481dcc0eb617feb9f8f7403ce179e77e2eba2c7a067f4a1ea90e0fb47083d814) file(DOWNLOAD "https://lfs.interstice.cloud/vision.cpp/tests/reference/esrgan-gpu.png/a8bfab0e07aeca16b737872bb3dbbe0e6b76cfff5616d2f02f2b0465cc7a0937" "tests/reference/esrgan-gpu.png" EXPECTED_HASH SHA256=a8bfab0e07aeca16b737872bb3dbbe0e6b76cfff5616d2f02f2b0465cc7a0937) @@ -7,4 +8,4 @@ file(DOWNLOAD "https://lfs.interstice.cloud/vision.cpp/tests/reference/migan-gpu file(DOWNLOAD "https://lfs.interstice.cloud/vision.cpp/tests/reference/mobile_sam-box-cpu.png/1a4d1a6a45861c8481e55d215b0d8a57c7fd7cb29c0698fa1fad0e96b59c13e8" "tests/reference/mobile_sam-box-cpu.png" EXPECTED_HASH SHA256=1a4d1a6a45861c8481e55d215b0d8a57c7fd7cb29c0698fa1fad0e96b59c13e8) file(DOWNLOAD "https://lfs.interstice.cloud/vision.cpp/tests/reference/mobile_sam-box-gpu.png/51e1a3ac5ba467152b1858d98d4522f401e0d7104069e915e87df6df5993877c" "tests/reference/mobile_sam-box-gpu.png" EXPECTED_HASH SHA256=51e1a3ac5ba467152b1858d98d4522f401e0d7104069e915e87df6df5993877c) file(DOWNLOAD "https://lfs.interstice.cloud/vision.cpp/tests/reference/mobile_sam-point-cpu.png/1abe24d0d0e5d5a703ab13a1c7dc7e1e24dd4e239dbee54ce70cac3edeccaff3" "tests/reference/mobile_sam-point-cpu.png" EXPECTED_HASH SHA256=1abe24d0d0e5d5a703ab13a1c7dc7e1e24dd4e239dbee54ce70cac3edeccaff3) -file(DOWNLOAD "https://lfs.interstice.cloud/vision.cpp/tests/reference/mobile_sam-point-gpu.png/2016069b41f87f6958eaffe66baf686660eae72acaa4cff2febb2d6a8d170912" "tests/reference/mobile_sam-point-gpu.png" EXPECTED_HASH SHA256=2016069b41f87f6958eaffe66baf686660eae72acaa4cff2febb2d6a8d170912) +file(DOWNLOAD "https://lfs.interstice.cloud/vision.cpp/tests/reference/mobile_sam-point-gpu.png/2016069b41f87f6958eaffe66baf686660eae72acaa4cff2febb2d6a8d170912" "tests/reference/mobile_sam-point-gpu.png" EXPECTED_HASH SHA256=2016069b41f87f6958eaffe66baf686660eae72acaa4cff2febb2d6a8d170912) \ No newline at end of file diff --git a/tests/test-image.cpp b/tests/test-image.cpp index b3eaa18..25fc10b 100644 --- a/tests/test-image.cpp +++ b/tests/test-image.cpp @@ -1,7 +1,7 @@ -#include "testing.hpp" -#include "visp/image-impl.hpp" -#include "visp/image.hpp" -#include "visp/util.hpp" +#include "testing.h" +#include "visp/image-impl.h" +#include "visp/image.h" +#include "visp/util.h" #include #include diff --git a/tests/test-models.cpp b/tests/test-models.cpp index 531c5ac..3f7b803 100644 --- a/tests/test-models.cpp +++ b/tests/test-models.cpp @@ -1,7 +1,7 @@ -#include "util/string.hpp" -#include 
"visp/vision.hpp" +#include "util/string.h" +#include "visp/vision.h" -#include "testing.hpp" +#include "testing.h" namespace visp { @@ -47,10 +47,29 @@ VISP_BACKEND_TEST(test_birefnet)(backend_type bt) { image_data input = image_load(input_path.string().c_str()); image_data output = birefnet_compute(model, input); - float tolerance = bt == backend_type::cpu ? 0.01f : 0.3f; // TODO: GPU is non-deterministic + float tolerance = bt == backend_type::cpu ? 0.01f : 0.015f; compare_images(name, output, tolerance); } +VISP_TEST(test_birefnet_dynamic) { + path model_path = test_dir().models / "BiRefNet-dynamic-F16.gguf"; + if (!exists(model_path) || !backend_is_available(backend_type::gpu)) { + throw test_skip{"Model not available"}; // it's a large model + } + // Test using 2 images with different resolutions one after the other + path input_path1 = test_dir().input / "cat-and-hat.jpg"; + path input_path2 = test_dir().input / "wardrobe.jpg"; + + backend_device b = backend_init(backend_type::gpu); + birefnet_model model = birefnet_load_model(model_path.string().c_str(), b); + image_data input1 = image_load(input_path1.string().c_str()); + image_data input2 = image_load(input_path2.string().c_str()); + image_data output1 = birefnet_compute(model, input1); + image_data output2 = birefnet_compute(model, input2); + + compare_images("birefnet-dynamic.png", output2, 0.015f); +} + VISP_BACKEND_TEST(test_migan)(backend_type bt) { path model_path = test_dir().models / "MIGAN-512-places2-F16.gguf"; path image_path = test_dir().input / "bench-image.jpg"; diff --git a/tests/test_birefnet.py b/tests/test_birefnet.py index 33dcf8c..353bb0d 100644 --- a/tests/test_birefnet.py +++ b/tests/test_birefnet.py @@ -15,6 +15,8 @@ torch.set_printoptions(precision=3, linewidth=100, edgeitems=6, sci_mode=False) +nhwc_layout = dict(memory_layout="nhwc") + class WindowAttention(nn.Module): def __init__( @@ -741,7 +743,7 @@ def test_encode(): state.update({f"input{i}": to_nhwc(xs[i]) for i in range(4)}) state.update({f"input_low{i}": to_nhwc(xs_low[i]) for i in range(4)}) - results = workbench.invoke_test("biref_encode", x, state) + results = workbench.invoke_test("biref_encode", x, state, nhwc_layout) for i, e in enumerate(expected): result = to_nchw(results[i]) @@ -755,28 +757,37 @@ def test_encode(): @pytest.mark.parametrize("scenario", ["small", "large"]) +@pytest.mark.parametrize("memory_layout", ["nchw", "nhwc"]) @pytest.mark.parametrize("backend", ["cpu", "vulkan"]) -def test_conv_2d_deform(scenario: str, backend: str): +def test_conv_2d_deform(scenario: str, memory_layout: str, backend: str): + torch.manual_seed(42) + if memory_layout == "nhwc" and backend == "vulkan": + pytest.skip("conv_2d_deform with nhwc layout is not supported on Vulkan") + w, h, c_in, c_out, k = { "small": (4, 4, 5, 2, 3), - "large": (42, 38, 82, 32, 3), + "large": (49, 38, 81, 17, 7), }[scenario] - x = input_tensor(1, c_in, h, w) - weight = input_tensor(c_out, c_in, k, k) + x = torch.rand(1, c_in, h, w) - 0.5 + weight = torch.rand(c_out, c_in, k, k) - 0.5 offset = 1.0 - input_tensor(1, 2 * k * k, h, w) mask = torch.rand(1, k * k, h, w) - expected = torchvision.ops.deform_conv2d(x, offset, weight, mask=mask, padding=(1, 1)) + expected = torchvision.ops.deform_conv2d(x, offset, weight, mask=mask, padding=(k // 2, k // 2)) - x = to_nhwc(x) state = { - "weight": to_nhwc(weight), - "offset": to_nhwc(offset), - "mask": to_nhwc(mask), + "weight": weight, + "offset": offset, + "mask": mask, } - result = workbench.invoke_test("conv_2d_deform", x, state, 
backend=backend)
-    result = to_nchw(result)
+    if memory_layout == "nhwc":
+        x = to_nhwc(x)
+        state = {k: to_nhwc(v) for k, v in state.items()}
+    params = dict(memory_layout=memory_layout, padding=k // 2)
+    result = workbench.invoke_test("conv_2d_deform", x, state, params, backend=backend)
+    if memory_layout == "nhwc":
+        result = to_nchw(result)
 
-    assert torch.allclose(result, expected, atol=1e-2 if backend == "vulkan" else 1e-5)
+    assert torch.allclose(result, expected, atol=0.1 if backend == "vulkan" else 0.001)
 
 
 class DeformableConv2d(nn.Module):
@@ -862,7 +873,7 @@ def test_deformable_conv_2d():
     state = convert_to_nhwc(state, key="conv")
     state = {shorten_weight_name(k): v for k, v in state.items()}
     x = to_nhwc(x)
-    result = workbench.invoke_test("biref_deformable_conv_2d", x, state)
+    result = workbench.invoke_test("biref_deformable_conv_2d", x, state, nhwc_layout)
     result = to_nchw(result)
 
     assert torch.allclose(result, expected)
@@ -907,10 +918,8 @@ def test_global_avg_pool(backend: str):
 
     state = fuse_all_conv_2d_batch_norm(state, "", "1", "2")
     state = convert_to_nhwc(state, key="1.weight")
-    for k, v in state.items():
-        print(f"{k}: {v.shape}")
     x = to_nhwc(x)
-    result = workbench.invoke_test("biref_global_avg_pool", x, state, backend=backend)
+    result = workbench.invoke_test("biref_global_avg_pool", x, state, nhwc_layout, backend=backend)
     result = to_nchw(result)
 
     assert torch.allclose(result, expected)
@@ -1002,7 +1011,7 @@ def test_aspp_deformable():
 
     state = {shorten_weight_name(k): v for k, v in state.items()}
     x = to_nhwc(x)
-    result = workbench.invoke_test("biref_aspp_deformable", x, state)
+    result = workbench.invoke_test("biref_aspp_deformable", x, state, nhwc_layout)
     result = to_nchw(result)
 
     assert torch.allclose(result, expected)
@@ -1048,7 +1057,7 @@ def test_basic_dec_blk():
 
     state = {shorten_weight_name(k): v for k, v in state.items()}
     x = to_nhwc(x)
-    result = workbench.invoke_test("biref_basic_dec_blk", x, state)
+    result = workbench.invoke_test("biref_basic_dec_blk", x, state, nhwc_layout)
     result = to_nchw(result)
 
     assert torch.allclose(result, expected)
@@ -1300,7 +1309,7 @@ def test_decoder():
     state["x3"] = to_nhwc(x3)
     state["x4"] = to_nhwc(x4)
 
-    result = workbench.invoke_test("biref_decode", x, state)
+    result = workbench.invoke_test("biref_decode", x, state, nhwc_layout)
     result = to_nchw(result)
 
     assert torch.allclose(result, expected)
diff --git a/tests/test_esrgan.py b/tests/test_esrgan.py
index 42f0a2e..92893f7 100644
--- a/tests/test_esrgan.py
+++ b/tests/test_esrgan.py
@@ -3,8 +3,7 @@ from torch import nn
 
 from . import workbench
-from .workbench import to_nhwc, to_nchw, convert_to_nhwc
-from .workbench import input_tensor, generate_state
+from .workbench import input_tensor, generate_state, to_nhwc, to_nchw
 
 torch.set_printoptions(precision=3, sci_mode=False)
 
 
@@ -66,11 +65,7 @@ def test_upconv():
     x = input_tensor(1, 3, 2, 2)
     expected = block(x)
 
-    x = to_nhwc(x)
-    state = convert_to_nhwc(state, "1.")
     result = workbench.invoke_test("esrgan_upconv", x, state)
-    result = to_nchw(result)
-
     assert torch.allclose(result, expected)
 
 
@@ -107,11 +102,7 @@ def test_residual_dense_block():
     x = 0.1 * (input_tensor(1, 8, 6, 6) - 0.5)
     expected = block(x)
 
-    x = to_nhwc(x)
-    state = convert_to_nhwc(state, "conv")
     result = workbench.invoke_test("esrgan_residual_dense_block", x, state)
-    result = to_nchw(result)
-
     assert torch.allclose(result, expected)
 
 
@@ -141,12 +132,7 @@ def test_rrdb():
     x = 0.1 * input_tensor(1, 8, 6, 6)
     expected = block(x)
 
-    x = to_nhwc(x)
-    state = convert_to_nhwc(state, "conv")
-    result = to_nhwc(torch.zeros_like(expected))
     result = workbench.invoke_test("esrgan_rrdb", x, state)
-    result = to_nchw(result)
-
     assert torch.allclose(result, expected, atol=1e-5)
 
 
@@ -246,7 +232,6 @@ def test_rrdbnet():
     expected = model(x)
 
     x = to_nhwc(x)
-    state = convert_to_nhwc(state, ".")
     result = workbench.invoke_test("esrgan_rrdbnet", x, state)
     result = to_nchw(result)
 
diff --git a/tests/test_migan.py b/tests/test_migan.py
index 9b89131..a717615 100644
--- a/tests/test_migan.py
+++ b/tests/test_migan.py
@@ -10,6 +10,8 @@
 
 torch.set_printoptions(precision=3, sci_mode=False)
 
+nhwc_layout = dict(memory_layout="nhwc")
+
 
 class lrelu_agc:
     def __init__(self, alpha=0.2, gain=1, clamp=None):
@@ -89,9 +91,7 @@ def __init__(self, in_channels):
             stride=2,
         )
         f = setup_filter([1, 3, 3, 1], gain=1)
-        self.filter.weight = nn.Parameter(
-            f.repeat([*self.filter.weight.shape[:2], 1, 1])
-        )
+        self.filter.weight = nn.Parameter(f.repeat([*self.filter.weight.shape[:2], 1, 1]))
 
     def forward(self, x):
         x = self.filter(x)
@@ -106,7 +106,7 @@ def test_downsample2d():
 
     state = convert_to_nhwc(state, key="filter.")
     x = to_nhwc(x)
-    result = workbench.invoke_test("migan_downsample_2d", x, state)
+    result = workbench.invoke_test("migan_downsample_2d", x, state, nhwc_layout)
     result = to_nchw(result)
 
     assert torch.allclose(result, expected)
@@ -118,9 +118,7 @@ def __init__(self, in_channels, resolution=None):
         self.nearest_up = nn.Upsample(scale_factor=2, mode="nearest")
         w = torch.tensor([[1.0, 0.0], [0.0, 0.0]], dtype=torch.float32)
         assert resolution is not None
-        self.register_buffer(
-            "filter_const", w.repeat(1, 1, resolution // 2, resolution // 2)
-        )
+        self.register_buffer("filter_const", w.repeat(1, 1, resolution // 2, resolution // 2))
 
         self.filter = nn.Conv2d(
             in_channels=in_channels,
@@ -131,9 +129,7 @@ def __init__(self, in_channels, resolution=None):
         )
 
         f = setup_filter([1, 3, 3, 1], gain=4)
-        self.filter.weight = nn.Parameter(
-            f.repeat([*self.filter.weight.shape[:2], 1, 1])
-        )
+        self.filter.weight = nn.Parameter(f.repeat([*self.filter.weight.shape[:2], 1, 1]))
 
     def forward(self, x):
         x = self.nearest_up(x)
@@ -151,14 +147,13 @@ def test_upsample2d():
 
     state = convert_to_nhwc(state, key="filter.")
     x = to_nhwc(x)
-    result = workbench.invoke_test("migan_upsample_2d", x, state)
+    result = workbench.invoke_test("migan_upsample_2d", x, state, nhwc_layout)
     result = to_nchw(result)
 
     assert torch.allclose(result, expected)
 
 
 class SeparableConv2d(nn.Module):
-
     def __init__(
         self,
         in_channels,
@@ -242,7 +237,7 @@ def test_separable_conv2d():
     state = convert_to_nhwc(state, key="conv")
     state["noise_strength"] = torch.tensor([0.5])
     x = to_nhwc(x)
-    result = workbench.invoke_test("migan_separable_conv_2d", x, state)
+    result = workbench.invoke_test("migan_separable_conv_2d", x, state, nhwc_layout)
     result = to_nchw(result)
 
     assert torch.allclose(result, expected)
@@ -295,16 +290,12 @@ def __init__(
         self.encode_res = [2**i for i in range(log2res, 1, -1)]
         self.ic_n = ic_n
 
-        for idx, (resi, resj) in enumerate(
-            zip(self.encode_res[:-1], self.encode_res[1:])
-        ):
+        for idx, (resi, resj) in enumerate(zip(self.encode_res[:-1], self.encode_res[1:])):
             hidden_ch_i = min(ch_base // resi, ch_max)
             hidden_ch_j = min(ch_base // resj, ch_max)
 
             if idx == 0:
-                block = EncoderBlock(
-                    hidden_ch_i, hidden_ch_j, rgb_n=ic_n, activation=activation
-                )
+                block = EncoderBlock(hidden_ch_i, hidden_ch_j, rgb_n=ic_n, activation=activation)
             else:
                 block = EncoderBlock(hidden_ch_i, hidden_ch_j, activation=activation)
 
@@ -342,7 +333,7 @@ def test_encoder():
         if "noise_strength" in k:
             state[k] = torch.tensor([0.5])
     x = to_nhwc(x)
-    result = workbench.invoke_test("migan_encoder", x, state)
+    result = workbench.invoke_test("migan_encoder", x, state, nhwc_layout)
     result = to_nchw(result)
 
     assert torch.allclose(result, expected)
@@ -454,9 +445,7 @@ def __init__(
         self.block_res = block_res
 
         hidden_ch = min(ch_base // block_res[0], ch_max)
-        self.b4 = SynthesisBlockFirst(
-            hidden_ch, resolution=4, rgb_n=rgb_n, activation=activation
-        )
+        self.b4 = SynthesisBlockFirst(hidden_ch, resolution=4, rgb_n=rgb_n, activation=activation)
 
         for resi, resj in zip(block_res[:-1], block_res[1:]):
             hidden_ch_i = min(ch_base // resi, ch_max)
@@ -499,7 +488,7 @@ def test_synthesis():
             state[k] = torch.tensor([0.5])
     x = to_nhwc(x)
     state.update({f"feat{k}": to_nhwc(v) for k, v in enc_feats.items()})
-    result = workbench.invoke_test("migan_synthesis", x, state)
+    result = workbench.invoke_test("migan_synthesis", x, state, nhwc_layout)
     result = to_nchw(result)
 
     assert torch.allclose(result, expected)
diff --git a/tests/test_mobile_sam.py b/tests/test_mobile_sam.py
index cb93313..6bcc090 100644
--- a/tests/test_mobile_sam.py
+++ b/tests/test_mobile_sam.py
@@ -10,6 +10,7 @@
 
 torch.set_printoptions(precision=2, linewidth=100, sci_mode=False)
 
+nhwc_layout = dict(memory_layout="nhwc")
 
 #
 # Image Encoder
@@ -49,7 +50,7 @@ def test_conv_2d_batch_norm(bias: bool):
     state = fuse_all_conv_2d_batch_norm(state)
     state = convert_to_nhwc(state)
     x = to_nhwc(x)
-    result = workbench.invoke_test("sam_conv_2d_batch_norm", x, state)
+    result = workbench.invoke_test("sam_conv_2d_batch_norm", x, state, nhwc_layout)
     result = to_nchw(result)
 
     assert torch.allclose(result, expected)
@@ -94,7 +95,7 @@ def test_patch_embed():
     convert_to_nhwc(state)
     x = to_nhwc(x)
     result = to_nhwc(torch.zeros_like(expected))
-    result = workbench.invoke_test("sam_patch_embed", x, state)
+    result = workbench.invoke_test("sam_patch_embed", x, state, nhwc_layout)
     result = to_nchw(result)
 
     assert torch.allclose(result, expected, rtol=0.001, atol=0.02)
@@ -126,7 +127,7 @@ def test_layer_norm_2d():
 
     x = to_nhwc(x)
     result = to_nhwc(torch.zeros_like(expected))
-    result = workbench.invoke_test("layer_norm", x, state)
+    result = workbench.invoke_test("layer_norm", x, state, nhwc_layout)
    result = to_nchw(result)
 
     assert torch.allclose(result, expected, rtol=0.001, atol=0.02)
@@ -188,7 +189,7 @@ def test_mb_conv():
     state = fuse_all_conv_2d_batch_norm(state)
     convert_to_nhwc(state)
     x = to_nhwc(x)
-    result = workbench.invoke_test("sam_mb_conv", x, state)
+    result = workbench.invoke_test("sam_mb_conv", x, state, nhwc_layout)
     result = to_nchw(result)
 
     # precision: ggml_gelu uses fp16 look-up table & tanh approximation
@@ -239,7 +240,7 @@ def test_patch_merging():
     state = fuse_all_conv_2d_batch_norm(state)
     convert_to_nhwc(state)
     x = to_nhwc(x)
-    result = workbench.invoke_test("sam_patch_merging", x, state)
+    result = workbench.invoke_test("sam_patch_merging", x, state, nhwc_layout)
     result = result.transpose(1, 2).reshape_as(expected)
 
     # precision: ggml_gelu uses fp16 look-up table & tanh approximation
@@ -492,7 +493,7 @@ def test_tiny_vit_block():
     ]
     state = fuse_all_conv_2d_batch_norm(state)
     state = convert_to_nhwc(state)
-    result = workbench.invoke_test("sam_tiny_vit_block", x, state)
+    result = workbench.invoke_test("sam_tiny_vit_block", x, state, nhwc_layout)
 
     assert torch.allclose(result, expected, rtol=0.001, atol=0.02)
 
@@ -1253,7 +1254,7 @@ def test_two_way_transformer():
     state["input_image_pe"] = to_nhwc(image_pe)
     state["input_point_embedding"] = point_embedding
     result_queries, result_keys = workbench.invoke_test(
-        "sam_two_way_transformer", image_embedding, state
+        "sam_two_way_transformer", image_embedding, state, nhwc_layout
     )
 
     assert torch.allclose(result_queries, expected_queries, atol=1e-6, rtol=1e-4)
@@ -1321,7 +1322,7 @@ def test_output_upscaling():
     expected = upscaling(x)
 
     x = to_nhwc(x)
-    result = workbench.invoke_test("sam_output_upscaling", x, state, backend="vulkan")
+    result = workbench.invoke_test("sam_output_upscaling", x, state, nhwc_layout, backend="vulkan")
     result = to_nchw(result)
 
     assert torch.allclose(result, expected, atol=1e-4, rtol=1e-2)  # fp16 weights
@@ -1460,7 +1461,7 @@ def test_predict_masks():
     state["input_dense_prompt"] = to_nhwc(dense_prompt_embeddings)
     result_masks = torch.zeros_like(expected_masks).contiguous()
     result_masks, result_iou_pred = workbench.invoke_test(
-        "sam_predict_masks", image_embeddings, state, backend="vulkan"
+        "sam_predict_masks", image_embeddings, state, nhwc_layout, backend="vulkan"
    )
 
     assert torch.allclose(result_masks, expected_masks, rtol=1e-2, atol=1e-2)
diff --git a/tests/testing.cpp b/tests/testing.cpp
index 7949e90..d92c327 100644
--- a/tests/testing.cpp
+++ b/tests/testing.cpp
@@ -1,5 +1,5 @@
-#include "testing.hpp"
-#include "visp/ml.hpp"
+#include "testing.h"
+#include "visp/ml.h"
 #include
 #include
 
@@ -23,6 +23,7 @@ int main(int argc, char** argv) {
    int passed = 0;
    int failed = 0;
    int errors = 0;
+    int skipped = 0;
 
    std::string_view filter;
    bool exclude_gpu = false;
@@ -61,7 +62,7 @@ int main(int argc, char** argv) {
            }
        } catch (const visp::test_failure& e) {
            ++failed;
-            printf(" %s\n", "\033[31mFAILED\033[0m");
+            printf("%s %s\n", verbose ? "" : name, "\033[31mFAILED\033[0m");
            printf(" \033[90m%s:%d:\033[0m Assertion failed\n", e.file, e.line);
            printf(" \033[93m%s\033[0m\n", e.condition);
            if (e.eval) {
@@ -70,9 +71,14 @@ int main(int argc, char** argv) {
            if (!visp::extra_info.empty()) {
                printf(" %s\n", visp::extra_info.c_str());
            }
+        } catch (const visp::test_skip&) {
+            ++skipped;
+            if (verbose) {
+                printf(" %s\n", "\033[33mSKIPPED\033[0m");
+            }
        } catch (const std::exception& e) {
            ++errors;
-            printf(" %s\n", "\033[31mERROR\033[0m");
+            printf("%s %s\n", verbose ? "" : name, "\033[31mERROR\033[0m");
            printf(" \033[90m%s:%d:\033[0m Unhandled exception\n", test.file, test.line);
            printf(" \033[93m%s\033[0m\n", e.what());
        }
@@ -107,6 +113,9 @@ int main(int argc, char** argv) {
    if (errors > 0) {
        printf("\033[31m%d errors, ", errors);
    }
+    if (skipped > 0) {
+        printf("\033[33m%d skipped, ", skipped);
+    }
    printf("\033[92m%d passed %sin %lldms\033[0m\n", passed, color, (long long)duration);
 
    return (failed > 0 || errors > 0) ? 1 : 0;
diff --git a/tests/testing.hpp b/tests/testing.h
similarity index 96%
rename from tests/testing.hpp
rename to tests/testing.h
index b074cd6..f9d91ae 100644
--- a/tests/testing.hpp
+++ b/tests/testing.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "util/string.hpp"
+#include "util/string.h"
 #include
 #include
 
@@ -56,6 +56,11 @@ struct test_directories {
 
 test_directories const& test_dir();
 
+// Use `throw test_skip{"reason"}` in a test case to skip it without failing
+struct test_skip {
+    char const* reason = nullptr;
+};
+
 float& test_tolerance_value();
 
 struct test_with_tolerance {
diff --git a/tests/workbench.cpp b/tests/workbench.cpp
index 393ac13..f31e83d 100644
--- a/tests/workbench.cpp
+++ b/tests/workbench.cpp
@@ -1,9 +1,9 @@
-#include "util/string.hpp"
-#include "visp/arch/birefnet.hpp"
-#include "visp/arch/esrgan.hpp"
-#include "visp/arch/migan.hpp"
-#include "visp/arch/mobile-sam.hpp"
-#include "visp/nn.hpp"
+#include "util/string.h"
+#include "visp/arch/birefnet.h"
+#include "visp/arch/esrgan.h"
+#include "visp/arch/migan.h"
+#include "visp/arch/mobile-sam.h"
+#include "visp/nn.h"
 
 #include
 #include
@@ -11,6 +11,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
 
@@ -95,7 +96,8 @@ DEF(conv_2d_deform)(model_ref m, span<tensor> input, param_dict const& p) {
    tensor weight = m.weights("weight");
    tensor offset = m.weights("offset");
    tensor mask = m.find("mask");
-    return {conv_2d_deform(m, input[0], weight, offset, mask, 1, 1)};
+    int padding = p.get("padding", 1);
+    return {conv_2d_deform(m, input[0], weight, offset, mask, 1, padding)};
 }
 
 DEF(batch_norm_2d)(model_ref m, span<tensor> input, param_dict const& p) {
@@ -130,7 +132,7 @@ DEF(sam_mb_conv)(model_ref m, span<tensor> input, param_dict const& p) {
 }
 
 DEF(sam_patch_merging)(model_ref m, span<tensor> input, param_dict const& p) {
-    return {sam::patch_merging(m, input[0], 32)};
+    return {sam::patch_merging(m, input[0])};
 }
 
 DEF(sam_mlp)(model_ref m, span<tensor> input, param_dict const& p) {
@@ -239,8 +241,8 @@ DEF(biref_relative_position_index)(model_ref m, span<tensor> input, param_dict c
 DEF(biref_window_attention)(model_ref m, span<tensor> input, param_dict const& p) {
    int window_size = 3;
    tensor mask = m.find("mask");
-    auto rel_pos_index = birefnet::create_relative_position_index(m.weights_context, window_size);
-    ggml_backend_alloc_ctx_tensors(m.weights_context, workbench_backend());
+    auto rel_pos_index = birefnet::create_relative_position_index(m, window_size);
+    ggml_backend_alloc_ctx_tensors(m, workbench_backend());
    transfer_to_backend(rel_pos_index);
    return {birefnet::window_attention(m, input[0], mask, 2, window_size)};
 }
@@ -253,8 +255,8 @@ DEF(biref_swin_block)(model_ref m, span<tensor> input, param_dict const& p) {
    block.h = 6;
    block.shift = 0;
    tensor mask = m.find("mask");
-    auto rel_pos_index = birefnet::create_relative_position_index(m.weights_context, 3);
-    ggml_backend_alloc_ctx_tensors(m.weights_context, workbench_backend());
+    auto rel_pos_index = birefnet::create_relative_position_index(m, 3);
+    ggml_backend_alloc_ctx_tensors(m, workbench_backend());
    transfer_to_backend(rel_pos_index);
    return {birefnet::swin_block(m, input[0], mask, block)};
 }
@@ -275,9 +277,11 @@ DEF(biref_swin_layer)(model_ref m, span<tensor> input, param_dict const& p) {
    layer.n_heads = 2;
    layer.n_features = 8;
    layer.downsample = true;
-    auto rel_pos_index = birefnet::create_relative_position_index(m.weights_context, 3);
-    ggml_backend_alloc_ctx_tensors(m.weights_context, workbench_backend());
+    auto rel_pos_index = birefnet::create_relative_position_index(m, 3);
+    auto attn_mask = birefnet::create_attention_mask(m, 6, 6, 3);
+    ggml_backend_alloc_ctx_tensors(m, workbench_backend());
    transfer_to_backend(rel_pos_index);
+    transfer_to_backend(attn_mask);
    auto result = birefnet::swin_layer(m, input[0], 6, 6, layer, 3);
    ASSERT(result.w_down == 3 && result.h_down == 3);
    return {result.x_down};
@@ -293,11 +297,11 @@ DEF(biref_swin_transformer)(model_ref m, span<tensor> input, param_dict const& p
        swin_layer_t{2, 4, 8 * 4, true},
        swin_layer_t{2, 2, 8 * 8, false},
    }};
-    auto rel_pos_index = birefnet::create_relative_position_index(m.weights_context, 3);
+    auto rel_pos_index = birefnet::create_relative_position_index(m, 3);
    auto attn_masks = std::array{
-        birefnet::create_attention_mask(m.weights_context, 8, 8, 3), birefnet::create_attention_mask(m.weights_context, 4, 4, 3),
-        birefnet::create_attention_mask(m.weights_context, 2, 2, 3), birefnet::create_attention_mask(m.weights_context, 1, 1, 3)};
-    ggml_backend_alloc_ctx_tensors(m.weights_context, workbench_backend());
+        birefnet::create_attention_mask(m, 8, 8, 3), birefnet::create_attention_mask(m, 4, 4, 3),
+        birefnet::create_attention_mask(m, 2, 2, 3), birefnet::create_attention_mask(m, 1, 1, 3)};
+    ggml_backend_alloc_ctx_tensors(m, workbench_backend());
    transfer_to_backend(rel_pos_index);
    for (auto&& attn_mask : attn_masks) {
        transfer_to_backend(attn_mask);
@@ -520,7 +524,8 @@ void workbench_run(
    workbench& w = get_workbench();
    w.current_backend = backend_init(backend_type);
 
-    model_weights weights = model_init(w.current_backend, tensors.size() + 10);
+    model_weights weights = model_init(tensors.size() + 10);
+    weights.buffer_type = backend_type;
    compute_graph graph = compute_graph_init(1024);
    model_ref m(weights, graph);
@@ -541,8 +546,12 @@ void workbench_run(
    }
    param_dict test_params = build_dict(params);
 
-    test_case const& test = workbench_find_test(test_name);
+    std::string_view memory_layout = test_params.get("memory_layout", "whcn");
+    if (memory_layout == "cwhn" || memory_layout == "nhwc") {
+        m.flags |= model_build_flag::cwhn;
+    }
 
+    test_case const& test = workbench_find_test(test_name);
    std::vector<tensor> outputs = test.func(m, inputs, test_params);
    for (tensor& out : outputs) {
        out = compute_graph_output(m, ggml_cont(m, out));