Merged

Changes from all commits (20 commits):
0983209 cli: fix missing enumeration in switch (Acly, Aug 1, 2025)
fc32d2e build: clean up some global cmake directives (Acly, Aug 1, 2025)
8c7e923 ci: enable building vulkan backend for windows (Acly, Aug 1, 2025)
8ef8ca3 project: rename .hpp -> .h (Acly, Aug 1, 2025)
9613ebf tests: fix esrgan benchmark (Acly, Aug 4, 2025)
256340a ml: detect model parameters at conversion time and write the config t… (Acly, Aug 5, 2025)
3c0f5ca ml: some tools to make 2d memory layout more configurable (Acly, Aug 5, 2025)
d0d78fe ml: detect tensor layout during model transfer (Acly, Aug 5, 2025)
dcaf936 ml: use a compute graph to convert weights at loading time (Acly, Aug 6, 2025)
94ea227 esrgan: use cwhn layout on CPU, whcn layout on GPU (Acly, Aug 6, 2025)
9297d8c birefnet: allow running with cwhn and whcn layouts (Acly, Aug 7, 2025)
8399408 tests: make whcn layout the default, explicitly request cwhn via para… (Acly, Aug 7, 2025)
028a51e sam: support running with cwhn and whcn layout for comparisons (Acly, Aug 8, 2025)
43e24a0 ggml: use conv_2d_direct for vulkan (Acly, Aug 8, 2025)
9949e91 tests: birefnet gpu is no longer an issue; update performance numbers (Acly, Aug 8, 2025)
f82d2c6 birefnet: support HR and Dynamic variants (Acly, Aug 11, 2025)
87f3d76 birefnet: switch to direct version of conv_2d_deform on Vulkan (Acly, Aug 12, 2025)
b4ecd12 tests: update models & reference images (Acly, Aug 12, 2025)
7946b55 esrgan: fix tile merge not working sporadically because target image … (Acly, Aug 13, 2025)
ddeea58 add some missing includes (Acly, Aug 13, 2025)
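
Many of these commits revolve around switching between the whcn and cwhn tensor layouts. As a rough illustration (my reading of ggml conventions, not code from this PR): ggml lists dimensions innermost-first, so whcn keeps width contiguous in memory (ggml's classic convolution layout), while cwhn keeps channels contiguous, comparable to NHWC elsewhere.

```c++
#include <ggml.h>

// Sketch only: the same logical image in the two layouts named in the commits.
// ggml_new_tensor_4d takes extents innermost-first (ne0 varies fastest).
void layout_example(ggml_context* ctx) {
    const int64_t W = 64, H = 64, C = 3, N = 1;
    ggml_tensor* whcn = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, W, H, C, N); // width contiguous
    ggml_tensor* cwhn = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, C, W, H, N); // channels contiguous
    (void)whcn; (void)cwhn;
}
```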
.github/workflows/ci.yml: 11 changes (9 additions, 2 deletions)

@@ -37,6 +37,13 @@ jobs:
if: matrix.os == 'windows-latest'
uses: microsoft/setup-msbuild@v2

- name: Vulkan SDK (Windows)
if: matrix.os == 'windows-latest'
uses: humbletim/install-vulkan-sdk@v1.2
with:
version: 1.4.309.0
cache: true

- name: Configure (Linux)
if: matrix.os == 'ubuntu-22.04'
run: >
@@ -52,6 +59,7 @@
cmake . -B build -A x64
-D CMAKE_BUILD_TYPE=Release
-D VISP_CI=ON
-D VISP_VULKAN=ON

- name: Configure (MacOS)
if: matrix.os == 'macos-14'
@@ -74,8 +82,7 @@
# export GGML_VK_VISIBLE_DEVICES=0
# ctest --verbose

- name: Test CPU
if: matrix.os != 'ubuntu-22.04'
- name: Test
working-directory: ./build
run: ctest --verbose -C Release

CMakeLists.txt: 22 changes (17 additions, 5 deletions)

@@ -16,6 +16,8 @@ if(PROJECT_IS_TOP_LEVEL)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
endif()

# Configure assertions

if(VISP_DEV)
set(VISP_ASSERT "VISP_ASSERT_BREAK")
elseif(VISP_CI)
@@ -28,6 +30,8 @@ elseif(CMAKE_BUILD_TYPE)
endif()
endif()

# Configure address sanitizer (Clang only)

if(VISP_ASAN)
if(MSVC)
add_compile_options(/fsanitize=address)
@@ -38,12 +42,20 @@ if(VISP_ASAN)
endif()
endif()

if(MSVC)
add_compile_options(/Zi /utf-8)
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
add_link_options(/DEBUG) # Enable debug symbols also in release builds
# Windows/MSVC specific defaults

if(MSVC)
list(APPEND VISP_COMP_OPTIONS /utf-8)
list(APPEND VISP_DEFINITIONS _CRT_SECURE_NO_WARNINGS)
if(PROJECT_IS_TOP_LEVEL)
# Enable debug symbols also in release builds
list(APPEND VISP_COMP_OPTIONS /Zi)
list(APPEND VISP_LINK_OPTIONS /DEBUG)
endif()
endif()

# Configure warnings

if(VISP_DEV OR VISP_CI)
if(MSVC)
set(VISP_WARNINGS /W4 /WX /wd4251)
@@ -59,7 +71,7 @@ add_subdirectory(depend/stb)
if(VISP_FMT_LIB)
add_subdirectory(depend/fmt)
set(VISP_FMT_LINK fmt::fmt)
set(VISP_FMT_DEFS VISP_FMT_LIB)
list(APPEND VISP_DEFINITIONS VISP_FMT_LIB)
endif()

set(GGML_VULKAN ${VISP_VULKAN})
README.md: 22 changes (11 additions, 11 deletions)

@@ -48,11 +48,11 @@ Pass `--composite output.png` to composite input and mask. Use `--help` for more
#### API

```c++
#include <visp/vision.hpp>
#include <visp/vision.h>
using namespace visp;

void main() {
backend cpu = backend_init(backend_type::cpu);
backend_device cpu = backend_init(backend_type::cpu);
sam_model sam = sam_load_model("MobileSAM-F16.gguf", cpu);

image_data input_image = image_load("input.jpg");
@@ -180,32 +180,32 @@ as other frameworks for inference speed, but with:
* CPU: AMD Ryzen 5 5600X (6 cores)
* GPU: NVIDIA GeForce RTX 4070

#### MobileSAM, 1024x1024, encode + decode
#### MobileSAM, 1024x1024

| | | _vision.cpp_ | PyTorch | ONNX Runtime |
| :--- | :--- | -----------: | ----------: | -----------: |
| cpu | f32 | 632 + 37 ms | 559 + 42 ms | 728 + 87 ms |
| gpu | f16 | 18 + 3 ms | 10 + 6 ms | |
| | | _vision.cpp_ | PyTorch | ONNX Runtime |
| :--- | :--- | -----------: | ------: | -----------: |
| cpu | f32 | 669 ms | 601 ms | 805 ms |
| gpu | f16 | 19 ms | 16 ms | |

#### BiRefNet, 1024x1024

| Model | | | _vision.cpp_ | PyTorch | ONNX Runtime |
| :---- | :--- | :--- | -----------: | -------: | -----------: |
| Full | cpu | f32 | 16333 ms | 18800 ms | |
| Full | gpu | f16 | 380 ms | 140 ms | |
| Full | gpu | f16 | 243 ms | 140 ms | |
| Lite | cpu | f32 | 4505 ms | 10900 ms | 6978 ms |
| Lite | gpu | f16 | 204 ms | 59 ms | 967 ms |
| Lite | gpu | f16 | 86 ms | 59 ms | |

#### MI-GAN, 512x512

| Model | | | _vision.cpp_ | PyTorch |
| :---------- | :--- | :--- | -----------: | ------: |
| 512-places2 | cpu | f32 | 523 ms | 637 ms |
| 512-places2 | gpu | f16 | 24 ms | 17 ms |
| 512-places2 | gpu | f16 | 21 ms | 17 ms |

#### Setup

* vision.cpp: using vision-bench, GPU via Vulkan, eg. `vision-bench sam cpu`
* vision.cpp: using vision-bench, GPU via Vulkan, eg. `vision-bench -m sam -b cpu`
* PyTorch: v2.7.1+cu128, eager eval, GPU via CUDA, average n iterations after warm-up

## Dependencies (integrated)
depend/ggml: 2 changes (1 addition, 1 deletion)

Submodule ggml updated 95 files:
+1 −0 .github/pull_request_template.md
+15 −33 .github/workflows/ci.yml
+4 −1 CMakeLists.txt
+2 −0 CONTRIBUTING.md
+127 −90 cmake/ggml-config.cmake.in
+1 −1 examples/simple/simple-backend.cpp
+1 −1 scripts/sync-llama.last
+1 −1 scripts/sync-whisper.last
+8 −5 src/ggml-backend.cpp
+3 −1 src/ggml-cann/acl_tensor.cpp
+178 −35 src/ggml-cann/aclnn_ops.cpp
+143 −25 src/ggml-cann/aclnn_ops.h
+151 −26 src/ggml-cann/ggml-cann.cpp
+5 −2 src/ggml-cpu/CMakeLists.txt
+14 −0 src/ggml-cpu/arch-fallback.h
+71 −596 src/ggml-cpu/arch/arm/quants.c
+14 −286 src/ggml-cpu/arch/arm/repack.cpp
+91 −570 src/ggml-cpu/arch/loongarch/quants.c
+103 −596 src/ggml-cpu/arch/powerpc/quants.c
+55 −341 src/ggml-cpu/arch/riscv/quants.c
+3 −58 src/ggml-cpu/arch/riscv/repack.cpp
+62 −305 src/ggml-cpu/arch/s390/quants.c
+54 −314 src/ggml-cpu/arch/wasm/quants.c
+94 −673 src/ggml-cpu/arch/x86/quants.c
+3,202 −239 src/ggml-cpu/arch/x86/repack.cpp
+109 −12 src/ggml-cpu/kleidiai/kernels.cpp
+3 −0 src/ggml-cpu/kleidiai/kernels.h
+88 −10 src/ggml-cpu/kleidiai/kleidiai.cpp
+176 −10 src/ggml-cpu/ops.cpp
+263 −0 src/ggml-cpu/repack.cpp
+11 −0 src/ggml-cpu/repack.h
+3 −3 src/ggml-cuda/CMakeLists.txt
+54 −27 src/ggml-cuda/common.cuh
+64 −17 src/ggml-cuda/convert.cu
+10 −36 src/ggml-cuda/cpy-utils.cuh
+40 −63 src/ggml-cuda/cpy.cu
+129 −56 src/ggml-cuda/fattn-common.cuh
+44 −54 src/ggml-cuda/fattn-mma-f16.cuh
+15 −34 src/ggml-cuda/fattn-tile-f16.cu
+30 −49 src/ggml-cuda/fattn-tile-f32.cu
+38 −64 src/ggml-cuda/fattn-vec-f16.cuh
+29 −55 src/ggml-cuda/fattn-vec-f32.cuh
+16 −35 src/ggml-cuda/fattn-wmma-f16.cu
+5 −14 src/ggml-cuda/fattn.cu
+109 −23 src/ggml-cuda/ggml-cuda.cu
+45 −35 src/ggml-cuda/im2col.cu
+111 −3 src/ggml-cuda/mma.cuh
+21 −3 src/ggml-cuda/mmq.cu
+1,156 −697 src/ggml-cuda/mmq.cuh
+92 −5 src/ggml-cuda/norm.cu
+2 −0 src/ggml-cuda/norm.cuh
+67 −0 src/ggml-cuda/roll.cu
+5 −0 src/ggml-cuda/roll.cuh
+5 −18 src/ggml-cuda/set-rows.cu
+34 −0 src/ggml-cuda/softcap.cu
+5 −0 src/ggml-cuda/softcap.cuh
+21 −7 src/ggml-cuda/vendors/hip.h
+2 −2 src/ggml-cuda/vendors/musa.h
+4 −0 src/ggml-hip/CMakeLists.txt
+1 −0 src/ggml-metal/ggml-metal-impl.h
+23 −5 src/ggml-metal/ggml-metal.m
+142 −40 src/ggml-metal/ggml-metal.metal
+18 −4 src/ggml-musa/CMakeLists.txt
+4 −0 src/ggml-opencl/CMakeLists.txt
+562 −84 src/ggml-opencl/ggml-opencl.cpp
+73 −0 src/ggml-opencl/kernels/add.cl
+185 −0 src/ggml-opencl/kernels/conv2d.cl
+176 −0 src/ggml-opencl/kernels/conv2d_f16_f32.cl
+66 −0 src/ggml-opencl/kernels/div.cl
+1 −1 src/ggml-opencl/kernels/im2col_f16.cl
+1 −1 src/ggml-opencl/kernels/im2col_f32.cl
+73 −0 src/ggml-opencl/kernels/mul.cl
+132 −0 src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl
+133 −0 src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl
+79 −0 src/ggml-opencl/kernels/rms_norm.cl
+66 −0 src/ggml-opencl/kernels/sub.cl
+4 −4 src/ggml-rpc/ggml-rpc.cpp
+1 −0 src/ggml-sycl/backend.hpp
+0 −212 src/ggml-sycl/cpy.cpp
+213 −1 src/ggml-sycl/cpy.hpp
+69 −215 src/ggml-sycl/ggml-sycl.cpp
+1 −1 src/ggml-sycl/im2col.cpp
+133 −0 src/ggml-sycl/quantize.hpp
+8 −9 src/ggml-sycl/quants.hpp
+94 −0 src/ggml-sycl/set_rows.cpp
+2 −6 src/ggml-sycl/vecdotq.hpp
+567 −64 src/ggml-vulkan/ggml-vulkan.cpp
+376 −0 src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp
+2 −4 src/ggml-vulkan/vulkan-shaders/im2col.comp
+98 −1 src/ggml-vulkan/vulkan-shaders/im2col_deform.comp
+11 −7 src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp
+8 −2 src/ggml-vulkan/vulkan-shaders/rms_norm.comp
+17 −2 src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+15 −10 src/ggml.c
+357 −36 tests/test-backend-ops.cpp
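
Among other things, the submodule bump brings direct 2D convolution kernels (note conv2d_mm.comp for Vulkan), which the "ggml: use conv_2d_direct for vulkan" commit switches to. A hedged sketch of the call, assuming ggml_conv_2d_direct mirrors the ggml_conv_2d signature; unlike the im2col + matmul path, it avoids materializing the unfolded input:

```c++
#include <ggml.h>

// kernel: [KW, KH, IC, OC], input: [W, H, IC, N] in whcn layout (assumed).
ggml_tensor* conv3x3(ggml_context* ctx, ggml_tensor* kernel, ggml_tensor* input) {
    // stride 1/1, padding 1/1, dilation 1/1: one fused kernel, no im2col buffer
    return ggml_conv_2d_direct(ctx, kernel, input, 1, 1, 1, 1, 1, 1);
}
```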
docs/model-implementation-guide.md: 7 changes (4 additions, 3 deletions)

@@ -51,7 +51,7 @@ PyTorch code.
The great thing about ggml is, you can always follow-reference in your IDE and
see almost immediately how things are implemented. It is small enough to be
compiled along-side, so you can step into functions, add prints, etc. If some
functionality is missing, you can quickly hack it in. Make sure to use.
functionality is missing, you can quickly hack it in. Make sure to use that.

### vision.cpp

@@ -68,7 +68,7 @@ tensor some_module(model_ref m, tensor x, ...)
Here `tensor` is short for `ggml_tensor *`, which can be a weight or the result
of an operation. The `model_ref` is used to build a compute graph by passing it
to ggml functions as replacement for `ggml_context *`. It keeps track of parent
modules and provides a way to access model weights.
modules and provides a way to access model weights by name.

`some_module` typically represents the forward function of a PyTorch
`nn.Module`. The whole model can be defined with reusable functions.
@@ -108,7 +108,8 @@ be converted. It's usually a good opportunity to optimize for inference, throw
away training-only stuff, maybe fuse some operations, or convert to a faster
memory layout.

If you haven't already, setup a Python environment (just running `uv sync` will do).
If you haven't already, setup a Python environment (I use
[uv](https://docs.astral.sh/uv/) and simply run `uv sync`).

Open `scripts/convert.py` and add a conversion function similar to the existing
ones. A 1:1 conversion is very simple:
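
To make the module pattern from the guide concrete, here is a minimal sketch of a forward function; the weight-lookup spelling and the implicit ggml_context* conversion are assumptions based on the description above, not code from this PR:

```c++
#include <visp/ml.h>
using namespace visp;

// Hypothetical module: conv + ReLU, mirroring a PyTorch nn.Module forward.
tensor conv_relu(model_ref m, tensor x) {
    tensor w = m["conv.weight"]; // assumed: weights are fetched by name via model_ref
    x = ggml_conv_2d(m, w, x, 1, 1, 1, 1, 1, 1); // m stands in for ggml_context*
    return ggml_relu(m, x);
}
```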
include/visp/image.hpp → include/visp/image.h: 8 changes (6 additions, 2 deletions)

@@ -1,6 +1,6 @@
#pragma once

#include "visp/util.hpp"
#include "visp/util.h"

#include <memory>
#include <span>
@@ -97,9 +97,12 @@ struct image_data {
std::unique_ptr<uint8_t[]> data;
};

// Allocate image data. Pixels are not initialized.
// Allocate image data. Memory is not initialized!
VISP_API image_data image_alloc(i32x2 extent, image_format format);

// Set all pixels to zero.
void image_clear(image_span const&);

// Load image from file (PNG, JPEG, etc.)
VISP_API image_data image_load(char const* filepath);

@@ -194,6 +197,7 @@ struct VISP_API tile_layout {
VISP_API tile_layout tile_scale(tile_layout const&, int scale);

// Merge a tile into the destination image. Both images must be rgb_f32 format.
// Blends pixels from `tile` and `dst` in overlap regions. `dst` must be all zeros initially.
VISP_API void tile_merge(
image_view const& tile, image_span const& dst, i32x2 tile_coord, tile_layout const& layout);

include/visp/ml.hpp → include/visp/ml.h: 107 changes (74 additions, 33 deletions)

@@ -1,7 +1,7 @@
#pragma once

#include "visp/image.hpp"
#include "visp/util.hpp"
#include "visp/image.h"
#include "visp/util.h"

#include <ggml-alloc.h>
#include <ggml-backend.h>
@@ -13,6 +13,8 @@
#include <limits>
#include <memory>
#include <span>
#include <string>
#include <string_view>
#include <vector>

namespace visp {
@@ -21,64 +23,111 @@ using std::span;
using tensor_name = fixed_string<GGML_MAX_NAME>;
using tensor = ggml_tensor*;

// Memory layout, especially for weights of 2D operations like convolutions
enum tensor_data_layout { unknown, whcn, cwhn };

//
// Backend
// Backend device - represents the compute hardware

enum class backend_type { cpu = 1, gpu = 2 };

// True if the backend library is loaded and has at least one supported device.
VISP_API bool backend_is_available(backend_type);

struct VISP_API backend_device {
struct backend_device {
ggml_backend_ptr handle;
ggml_backend_dev_t device;

backend_type type() const;
ggml_type preferred_float_type() const;
size_t total_memory() const;
VISP_API backend_type type() const;
VISP_API ggml_type preferred_float_type() const;
VISP_API tensor_data_layout preferred_layout() const;
VISP_API size_t total_memory() const;

operator ggml_backend_t() const { return handle.get(); }
};

// Initialize a backend device, automatically tries to pick the "best" available.
VISP_API backend_device backend_init();

// Initialize the most suited device that matches the specified backend type.
VISP_API backend_device backend_init(backend_type);

// Set number of threads used by the backend (CPU only).
VISP_API void backend_set_n_threads(backend_device&, int n_threads);

//
// Model build flags - backend capabilities, model configuration and optimization

enum class model_build_flag {
// clang-format off
cwhn = 1 << 0,
conv_2d_direct_cwhn = 1 << 1,
concat_n = 1 << 2,
f16_conv_transpose = 1 << 3,
window_partition = 1 << 4
}; // clang-format on

using model_build_flags = flags<model_build_flag>;

VISP_API model_build_flags backend_default_flags(backend_type);

//
// Model file - holds the contents of a GGUF file

struct model_file {
gguf_context_ptr gguf;
ggml_context_ptr data;
std::string path;

VISP_API int64_t n_tensors() const;
VISP_API std::string_view arch() const;
VISP_API tensor_data_layout tensor_layout() const;

VISP_API int64_t key(char const* name) const;
VISP_API int get_int(char const* name) const;
VISP_API std::string_view get_string(char const* name) const;
};

// Opens a .gguf file and reads its contents into memory.
VISP_API model_file model_load(char const* filepath);

//
// Model weights
//
// * stores the tensor descriptors of model weights
// * holds the backend buffers for model weight data
// * holds buffers for extra tensors such as pre-computed lookup tables

struct VISP_API model_weights {
struct model_weights {
ggml_context_ptr context;
backend_type buffer_type = backend_type::cpu;
ggml_backend_buffer_ptr weights_buffer;
std::vector<ggml_backend_buffer_ptr> extra_buffers;
model_build_flags flags;

ggml_type float_type() const;
VISP_API ggml_type float_type() const;

operator ggml_context*() const { return context.get(); }
};

// Creates a GGML context with storage for a fixed number of tensors.
// Does not allocate any backend buffers.
VISP_API model_weights model_init(backend_device const&, size_t n_tensors);

struct model_load_params {
ggml_type float_type = GGML_TYPE_COUNT; // default: use type stored in GGUF file
int n_extra_tensors = 0; // number of extra tensors to allocate in the context
};

// Loads model weights from a GGUF file and transfers them to backend buffers.
VISP_API model_weights model_load(char const* filepath, backend_device const&, model_load_params = {});
VISP_API model_weights model_init(size_t n_tensors);

// Allocates backend buffers for the model weights if needed. Does not transfer data.
// Returns false and does nothing if all tensors already have an associated backend buffer.
VISP_API bool model_allocate(model_weights&, backend_device const&);

// Adds model weights contained in `file` to `weights`. Allocates backend buffers for the
// weights on `device` and transfers the data to the device buffer.
// Optionally converts float weights to the specified data type during transfer.
VISP_API void model_transfer(
model_file const& file,
model_weights& weights,
backend_device const& device,
ggml_type float_type = GGML_TYPE_COUNT,
tensor_data_layout = tensor_data_layout::unknown);

//
// Compute graph - wrapper for ggml_cgraph and its associated backend memory

@@ -107,18 +156,6 @@ VISP_API void compute(compute_graph const&, backend_device const&);
// to support nested modules
// * pass anywhere ggml_context* is expected while building the graph

enum class model_build_flag {
// clang-format off
cwhn = 1 << 0,
conv_2d_direct = 1 << 1,
fused_batch_norm = 1 << 2,
concat_n = 1 << 3,
f16_conv_transpose = 1 << 4,
window_partition = 1 << 5
}; // clang-format on

using model_build_flags = flags<model_build_flag>;

struct VISP_API model_ref {
ggml_context* weights_context = nullptr;
ggml_context* graph_context = nullptr;
@@ -127,8 +164,8 @@ struct VISP_API model_ref {
tensor_name prefix;

model_ref() = default;
model_ref(model_weights& m);
model_ref(model_weights& m, compute_graph& g);
model_ref(model_weights&);
model_ref(model_weights&, compute_graph&);

explicit model_ref(
ggml_context* weights_context,
@@ -247,7 +284,7 @@ struct swin_params {

extern swin_params const swin_t_params;
extern swin_params const swin_l_params;
VISP_API swin_params swin_detect_params(model_ref);
VISP_API swin_params swin_detect_params(model_file const&);

//
// implementation
@@ -256,4 +293,8 @@ constexpr model_build_flags operator|(model_build_flag lhs, model_build_flag rhs
return model_build_flags(uint32_t(lhs) | uint32_t(rhs));
}

constexpr model_build_flags operator~(model_build_flag f) {
return ~model_build_flags(f);
}

} // namespace visp
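
Putting the reshaped loading API together, a minimal sketch of the new flow; the file name is a placeholder, the calls are the ones declared above:

```c++
#include <visp/ml.h>
using namespace visp;

// Split flow: read the GGUF once, then transfer (and optionally convert)
// the weights into backend buffers in a single step.
model_weights load_for_device(char const* gguf_path, backend_type type) {
    backend_device dev = backend_init(type);
    model_file file = model_load(gguf_path);         // GGUF contents in memory
    model_weights weights = model_init(file.n_tensors());
    model_transfer(file, weights, dev,
                   dev.preferred_float_type(),       // e.g. f16 on GPU
                   dev.preferred_layout());          // e.g. cwhn on CPU (see esrgan commit)
    return weights;
}
```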