Merged

Changes from all commits (20 commits):
0983209 cli: fix missing enumeration in switch (Acly, Aug 1, 2025)
fc32d2e build: clean up some global cmake directives (Acly, Aug 1, 2025)
8c7e923 ci: enable building vulkan backend for windows (Acly, Aug 1, 2025)
8ef8ca3 project: rename .hpp -> .h (Acly, Aug 1, 2025)
9613ebf tests: fix esrgan benchmark (Acly, Aug 4, 2025)
256340a ml: detect model parameters at conversion time and write the config t… (Acly, Aug 5, 2025)
3c0f5ca ml: some tools to make 2d memory layout more configurable (Acly, Aug 5, 2025)
d0d78fe ml: detect tensor layout during model transfer (Acly, Aug 5, 2025)
dcaf936 ml: use a compute graph to convert weights at loading time (Acly, Aug 6, 2025)
94ea227 esrgan: use cwhn layout on CPU, whcn layout on GPU (Acly, Aug 6, 2025)
9297d8c birefnet: allow running with cwhn and whcn layouts (Acly, Aug 7, 2025)
8399408 tests: make whcn layout the default, explicitly request cwhn via para… (Acly, Aug 7, 2025)
028a51e sam: support running with cwhn and whcn layout for comparisons (Acly, Aug 8, 2025)
43e24a0 ggml: use conv_2d_direct for vulkan (Acly, Aug 8, 2025)
9949e91 tests: birefnet gpu is no longer an issue; update performance numbers (Acly, Aug 8, 2025)
f82d2c6 birefnet: support HR and Dynamic variants (Acly, Aug 11, 2025)
87f3d76 birefnet: switch to direct version of conv_2d_deform on Vulkan (Acly, Aug 12, 2025)
b4ecd12 tests: update models & reference images (Acly, Aug 12, 2025)
7946b55 esrgan: fix tile merge not working sporadically because target image … (Acly, Aug 13, 2025)
ddeea58 add some missing includes (Acly, Aug 13, 2025)
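
Many of these commits revolve around switching between the whcn and cwhn tensor layouts. As a rough illustration (my reading of ggml conventions, not code from this PR): ggml lists dimensions innermost-first, so whcn keeps width contiguous in memory (ggml's classic convolution layout), while cwhn keeps channels contiguous, comparable to NHWC elsewhere.

```c++
#include <ggml.h>

// Sketch only: the same logical image in the two layouts named in the commits.
// ggml_new_tensor_4d takes extents innermost-first (ne0 varies fastest).
void layout_example(ggml_context* ctx) {
    const int64_t W = 64, H = 64, C = 3, N = 1;
    ggml_tensor* whcn = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, W, H, C, N); // width contiguous
    ggml_tensor* cwhn = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, C, W, H, N); // channels contiguous
    (void)whcn; (void)cwhn;
}
```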
.github/workflows/ci.yml: 11 changes (9 additions, 2 deletions)

@@ -37,6 +37,13 @@ jobs:
if: matrix.os == 'windows-latest'
uses: microsoft/setup-msbuild@v2

- name: Vulkan SDK (Windows)
if: matrix.os == 'windows-latest'
uses: humbletim/install-vulkan-sdk@v1.2
with:
version: 1.4.309.0
cache: true

- name: Configure (Linux)
if: matrix.os == 'ubuntu-22.04'
run: >
@@ -52,6 +59,7 @@
cmake . -B build -A x64
-D CMAKE_BUILD_TYPE=Release
-D VISP_CI=ON
-D VISP_VULKAN=ON

- name: Configure (MacOS)
if: matrix.os == 'macos-14'
@@ -74,8 +82,7 @@
# export GGML_VK_VISIBLE_DEVICES=0
# ctest --verbose

- name: Test CPU
if: matrix.os != 'ubuntu-22.04'
- name: Test
working-directory: ./build
run: ctest --verbose -C Release

CMakeLists.txt: 22 changes (17 additions, 5 deletions)

@@ -16,6 +16,8 @@ if(PROJECT_IS_TOP_LEVEL)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
endif()

# Configure assertions

if(VISP_DEV)
set(VISP_ASSERT "VISP_ASSERT_BREAK")
elseif(VISP_CI)
@@ -28,6 +30,8 @@ elseif(CMAKE_BUILD_TYPE)
endif()
endif()

# Configure address sanitizer (Clang only)

if(VISP_ASAN)
if(MSVC)
add_compile_options(/fsanitize=address)
@@ -38,12 +42,20 @@ if(VISP_ASAN)
endif()
endif()

if(MSVC)
add_compile_options(/Zi /utf-8)
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
add_link_options(/DEBUG) # Enable debug symbols also in release builds
# Windows/MSVC specific defaults

if(MSVC)
list(APPEND VISP_COMP_OPTIONS /utf-8)
list(APPEND VISP_DEFINITIONS _CRT_SECURE_NO_WARNINGS)
if(PROJECT_IS_TOP_LEVEL)
# Enable debug symbols also in release builds
list(APPEND VISP_COMP_OPTIONS /Zi)
list(APPEND VISP_LINK_OPTIONS /DEBUG)
endif()
endif()

# Configure warnings

if(VISP_DEV OR VISP_CI)
if(MSVC)
set(VISP_WARNINGS /W4 /WX /wd4251)
@@ -59,7 +71,7 @@ add_subdirectory(depend/stb)
if(VISP_FMT_LIB)
add_subdirectory(depend/fmt)
set(VISP_FMT_LINK fmt::fmt)
set(VISP_FMT_DEFS VISP_FMT_LIB)
list(APPEND VISP_DEFINITIONS VISP_FMT_LIB)
endif()

set(GGML_VULKAN ${VISP_VULKAN})
README.md: 22 changes (11 additions, 11 deletions)

@@ -48,11 +48,11 @@ Pass `--composite output.png` to composite input and mask. Use `--help` for more
#### API

```c++
#include <visp/vision.hpp>
#include <visp/vision.h>
using namespace visp;

void main() {
backend cpu = backend_init(backend_type::cpu);
backend_device cpu = backend_init(backend_type::cpu);
sam_model sam = sam_load_model("MobileSAM-F16.gguf", cpu);

image_data input_image = image_load("input.jpg");
@@ -180,32 +180,32 @@ as other frameworks for inference speed, but with:
* CPU: AMD Ryzen 5 5600X (6 cores)
* GPU: NVIDIA GeForce RTX 4070

#### MobileSAM, 1024x1024, encode + decode
#### MobileSAM, 1024x1024

| | | _vision.cpp_ | PyTorch | ONNX Runtime |
| :--- | :--- | -----------: | ----------: | -----------: |
| cpu | f32 | 632 + 37 ms | 559 + 42 ms | 728 + 87 ms |
| gpu | f16 | 18 + 3 ms | 10 + 6 ms | |
| | | _vision.cpp_ | PyTorch | ONNX Runtime |
| :--- | :--- | -----------: | ------: | -----------: |
| cpu | f32 | 669 ms | 601 ms | 805 ms |
| gpu | f16 | 19 ms | 16 ms | |

#### BiRefNet, 1024x1024

| Model | | | _vision.cpp_ | PyTorch | ONNX Runtime |
| :---- | :--- | :--- | -----------: | -------: | -----------: |
| Full | cpu | f32 | 16333 ms | 18800 ms | |
| Full | gpu | f16 | 380 ms | 140 ms | |
| Full | gpu | f16 | 243 ms | 140 ms | |
| Lite | cpu | f32 | 4505 ms | 10900 ms | 6978 ms |
| Lite | gpu | f16 | 204 ms | 59 ms | 967 ms |
| Lite | gpu | f16 | 86 ms | 59 ms | |

#### MI-GAN, 512x512

| Model | | | _vision.cpp_ | PyTorch |
| :---------- | :--- | :--- | -----------: | ------: |
| 512-places2 | cpu | f32 | 523 ms | 637 ms |
| 512-places2 | gpu | f16 | 24 ms | 17 ms |
| 512-places2 | gpu | f16 | 21 ms | 17 ms |

#### Setup

* vision.cpp: using vision-bench, GPU via Vulkan, eg. `vision-bench sam cpu`
* vision.cpp: using vision-bench, GPU via Vulkan, eg. `vision-bench -m sam -b cpu`
* PyTorch: v2.7.1+cu128, eager eval, GPU via CUDA, average n iterations after warm-up

## Dependencies (integrated)
depend/ggml: 2 changes (1 addition, 1 deletion)

Submodule ggml updated 95 files:
+1 −0 .github/pull_request_template.md
+15 −33 .github/workflows/ci.yml
+4 −1 CMakeLists.txt
+2 −0 CONTRIBUTING.md
+127 −90 cmake/ggml-config.cmake.in
+1 −1 examples/simple/simple-backend.cpp
+1 −1 scripts/sync-llama.last
+1 −1 scripts/sync-whisper.last
+8 −5 src/ggml-backend.cpp
+3 −1 src/ggml-cann/acl_tensor.cpp
+178 −35 src/ggml-cann/aclnn_ops.cpp
+143 −25 src/ggml-cann/aclnn_ops.h
+151 −26 src/ggml-cann/ggml-cann.cpp
+5 −2 src/ggml-cpu/CMakeLists.txt
+14 −0 src/ggml-cpu/arch-fallback.h
+71 −596 src/ggml-cpu/arch/arm/quants.c
+14 −286 src/ggml-cpu/arch/arm/repack.cpp
+91 −570 src/ggml-cpu/arch/loongarch/quants.c
+103 −596 src/ggml-cpu/arch/powerpc/quants.c
+55 −341 src/ggml-cpu/arch/riscv/quants.c
+3 −58 src/ggml-cpu/arch/riscv/repack.cpp
+62 −305 src/ggml-cpu/arch/s390/quants.c
+54 −314 src/ggml-cpu/arch/wasm/quants.c
+94 −673 src/ggml-cpu/arch/x86/quants.c
+3,202 −239 src/ggml-cpu/arch/x86/repack.cpp
+109 −12 src/ggml-cpu/kleidiai/kernels.cpp
+3 −0 src/ggml-cpu/kleidiai/kernels.h
+88 −10 src/ggml-cpu/kleidiai/kleidiai.cpp
+176 −10 src/ggml-cpu/ops.cpp
+263 −0 src/ggml-cpu/repack.cpp
+11 −0 src/ggml-cpu/repack.h
+3 −3 src/ggml-cuda/CMakeLists.txt
+54 −27 src/ggml-cuda/common.cuh
+64 −17 src/ggml-cuda/convert.cu
+10 −36 src/ggml-cuda/cpy-utils.cuh
+40 −63 src/ggml-cuda/cpy.cu
+129 −56 src/ggml-cuda/fattn-common.cuh
+44 −54 src/ggml-cuda/fattn-mma-f16.cuh
+15 −34 src/ggml-cuda/fattn-tile-f16.cu
+30 −49 src/ggml-cuda/fattn-tile-f32.cu
+38 −64 src/ggml-cuda/fattn-vec-f16.cuh
+29 −55 src/ggml-cuda/fattn-vec-f32.cuh
+16 −35 src/ggml-cuda/fattn-wmma-f16.cu
+5 −14 src/ggml-cuda/fattn.cu
+109 −23 src/ggml-cuda/ggml-cuda.cu
+45 −35 src/ggml-cuda/im2col.cu
+111 −3 src/ggml-cuda/mma.cuh
+21 −3 src/ggml-cuda/mmq.cu
+1,156 −697 src/ggml-cuda/mmq.cuh
+92 −5 src/ggml-cuda/norm.cu
+2 −0 src/ggml-cuda/norm.cuh
+67 −0 src/ggml-cuda/roll.cu
+5 −0 src/ggml-cuda/roll.cuh
+5 −18 src/ggml-cuda/set-rows.cu
+34 −0 src/ggml-cuda/softcap.cu
+5 −0 src/ggml-cuda/softcap.cuh
+21 −7 src/ggml-cuda/vendors/hip.h
+2 −2 src/ggml-cuda/vendors/musa.h
+4 −0 src/ggml-hip/CMakeLists.txt
+1 −0 src/ggml-metal/ggml-metal-impl.h
+23 −5 src/ggml-metal/ggml-metal.m
+142 −40 src/ggml-metal/ggml-metal.metal
+18 −4 src/ggml-musa/CMakeLists.txt
+4 −0 src/ggml-opencl/CMakeLists.txt
+562 −84 src/ggml-opencl/ggml-opencl.cpp
+73 −0 src/ggml-opencl/kernels/add.cl
+185 −0 src/ggml-opencl/kernels/conv2d.cl
+176 −0 src/ggml-opencl/kernels/conv2d_f16_f32.cl
+66 −0 src/ggml-opencl/kernels/div.cl
+1 −1 src/ggml-opencl/kernels/im2col_f16.cl
+1 −1 src/ggml-opencl/kernels/im2col_f32.cl
+73 −0 src/ggml-opencl/kernels/mul.cl
+132 −0 src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl
+133 −0 src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl
+79 −0 src/ggml-opencl/kernels/rms_norm.cl
+66 −0 src/ggml-opencl/kernels/sub.cl
+4 −4 src/ggml-rpc/ggml-rpc.cpp
+1 −0 src/ggml-sycl/backend.hpp
+0 −212 src/ggml-sycl/cpy.cpp
+213 −1 src/ggml-sycl/cpy.hpp
+69 −215 src/ggml-sycl/ggml-sycl.cpp
+1 −1 src/ggml-sycl/im2col.cpp
+133 −0 src/ggml-sycl/quantize.hpp
+8 −9 src/ggml-sycl/quants.hpp
+94 −0 src/ggml-sycl/set_rows.cpp
+2 −6 src/ggml-sycl/vecdotq.hpp
+567 −64 src/ggml-vulkan/ggml-vulkan.cpp
+376 −0 src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp
+2 −4 src/ggml-vulkan/vulkan-shaders/im2col.comp
+98 −1 src/ggml-vulkan/vulkan-shaders/im2col_deform.comp
+11 −7 src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp
+8 −2 src/ggml-vulkan/vulkan-shaders/rms_norm.comp
+17 −2 src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+15 −10 src/ggml.c
+357 −36 tests/test-backend-ops.cpp
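
Among other things, the submodule bump brings direct 2D convolution kernels (note conv2d_mm.comp for Vulkan), which the "ggml: use conv_2d_direct for vulkan" commit switches to. A hedged sketch of the call, assuming ggml_conv_2d_direct mirrors the ggml_conv_2d signature; unlike the im2col + matmul path, it avoids materializing the unfolded input:

```c++
#include <ggml.h>

// kernel: [KW, KH, IC, OC], input: [W, H, IC, N] in whcn layout (assumed).
ggml_tensor* conv3x3(ggml_context* ctx, ggml_tensor* kernel, ggml_tensor* input) {
    // stride 1/1, padding 1/1, dilation 1/1: one fused kernel, no im2col buffer
    return ggml_conv_2d_direct(ctx, kernel, input, 1, 1, 1, 1, 1, 1);
}
```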
docs/model-implementation-guide.md: 7 changes (4 additions, 3 deletions)

@@ -51,7 +51,7 @@ PyTorch code.
The great thing about ggml is, you can always follow-reference in your IDE and
see almost immediately how things are implemented. It is small enough to be
compiled along-side, so you can step into functions, add prints, etc. If some
functionality is missing, you can quickly hack it in. Make sure to use.
functionality is missing, you can quickly hack it in. Make sure to use that.

### vision.cpp

@@ -68,7 +68,7 @@ tensor some_module(model_ref m, tensor x, ...)
Here `tensor` is short for `ggml_tensor *`, which can be a weight or the result
of an operation. The `model_ref` is used to build a compute graph by passing it
to ggml functions as replacement for `ggml_context *`. It keeps track of parent
modules and provides a way to access model weights.
modules and provides a way to access model weights by name.

`some_module` typically represents the forward function of a PyTorch
`nn.Module`. The whole model can be defined with reusable functions.
@@ -108,7 +108,8 @@ be converted. It's usually a good opportunity to optimize for inference, throw
away training-only stuff, maybe fuse some operations, or convert to a faster
memory layout.

If you haven't already, setup a Python environment (just running `uv sync` will do).
If you haven't already, setup a Python environment (I use
[uv](https://docs.astral.sh/uv/) and simply run `uv sync`).

Open `scripts/convert.py` and add a conversion function similar to the existing
ones. A 1:1 conversion is very simple:
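
To make the module pattern from the guide concrete, here is a minimal sketch of a forward function; the weight-lookup spelling and the implicit ggml_context* conversion are assumptions based on the description above, not code from this PR:

```c++
#include <visp/ml.h>
using namespace visp;

// Hypothetical module: conv + ReLU, mirroring a PyTorch nn.Module forward.
tensor conv_relu(model_ref m, tensor x) {
    tensor w = m["conv.weight"]; // assumed: weights are fetched by name via model_ref
    x = ggml_conv_2d(m, w, x, 1, 1, 1, 1, 1, 1); // m stands in for ggml_context*
    return ggml_relu(m, x);
}
```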
include/visp/image.hpp → include/visp/image.h: 8 changes (6 additions, 2 deletions)

@@ -1,6 +1,6 @@
#pragma once

#include "visp/util.hpp"
#include "visp/util.h"

#include <memory>
#include <span>
@@ -97,9 +97,12 @@ struct image_data {
std::unique_ptr<uint8_t[]> data;
};

// Allocate image data. Pixels are not initialized.
// Allocate image data. Memory is not initialized!
VISP_API image_data image_alloc(i32x2 extent, image_format format);

// Set all pixels to zero.
void image_clear(image_span const&);

// Load image from file (PNG, JPEG, etc.)
VISP_API image_data image_load(char const* filepath);

@@ -194,6 +197,7 @@ struct VISP_API tile_layout {
VISP_API tile_layout tile_scale(tile_layout const&, int scale);

// Merge a tile into the destination image. Both images must be rgb_f32 format.
// Blends pixels from `tile` and `dst` in overlap regions. `dst` must be all zeros initially.
VISP_API void tile_merge(
image_view const& tile, image_span const& dst, i32x2 tile_coord, tile_layout const& layout);

include/visp/ml.hpp → include/visp/ml.h: 107 changes (74 additions, 33 deletions)

@@ -1,7 +1,7 @@
#pragma once

#include "visp/image.hpp"
#include "visp/util.hpp"
#include "visp/image.h"
#include "visp/util.h"

#include <ggml-alloc.h>
#include <ggml-backend.h>
@@ -13,6 +13,8 @@
#include <limits>
#include <memory>
#include <span>
#include <string>
#include <string_view>
#include <vector>

namespace visp {
@@ -21,64 +23,111 @@ using std::span;
using tensor_name = fixed_string<GGML_MAX_NAME>;
using tensor = ggml_tensor*;

// Memory layout, especially for weights of 2D operations like convolutions
enum tensor_data_layout { unknown, whcn, cwhn };

//
// Backend
// Backend device - represents the compute hardware

enum class backend_type { cpu = 1, gpu = 2 };

// True if the backend library is loaded and has at least one supported device.
VISP_API bool backend_is_available(backend_type);

struct VISP_API backend_device {
struct backend_device {
ggml_backend_ptr handle;
ggml_backend_dev_t device;

backend_type type() const;
ggml_type preferred_float_type() const;
size_t total_memory() const;
VISP_API backend_type type() const;
VISP_API ggml_type preferred_float_type() const;
VISP_API tensor_data_layout preferred_layout() const;
VISP_API size_t total_memory() const;

operator ggml_backend_t() const { return handle.get(); }
};

// Initialize a backend device, automatically tries to pick the "best" available.
VISP_API backend_device backend_init();

// Initialize the most suited device that matches the specified backend type.
VISP_API backend_device backend_init(backend_type);

// Set number of threads used by the backend (CPU only).
VISP_API void backend_set_n_threads(backend_device&, int n_threads);

//
// Model build flags - backend capabilities, model configuration and optimization

enum class model_build_flag {
// clang-format off
cwhn = 1 << 0,
conv_2d_direct_cwhn = 1 << 1,
concat_n = 1 << 2,
f16_conv_transpose = 1 << 3,
window_partition = 1 << 4
}; // clang-format on

using model_build_flags = flags<model_build_flag>;

VISP_API model_build_flags backend_default_flags(backend_type);

//
// Model file - holds the contents of a GGUF file

struct model_file {
gguf_context_ptr gguf;
ggml_context_ptr data;
std::string path;

VISP_API int64_t n_tensors() const;
VISP_API std::string_view arch() const;
VISP_API tensor_data_layout tensor_layout() const;

VISP_API int64_t key(char const* name) const;
VISP_API int get_int(char const* name) const;
VISP_API std::string_view get_string(char const* name) const;
};

// Opens a .gguf file and reads its contents into memory.
VISP_API model_file model_load(char const* filepath);

//
// Model weights
//
// * stores the tensor descriptors of model weights
// * holds the backend buffers for model weight data
// * holds buffers for extra tensors such as pre-computed lookup tables

struct VISP_API model_weights {
struct model_weights {
ggml_context_ptr context;
backend_type buffer_type = backend_type::cpu;
ggml_backend_buffer_ptr weights_buffer;
std::vector<ggml_backend_buffer_ptr> extra_buffers;
model_build_flags flags;

ggml_type float_type() const;
VISP_API ggml_type float_type() const;

operator ggml_context*() const { return context.get(); }
};

// Creates a GGML context with storage for a fixed number of tensors.
// Does not allocate any backend buffers.
VISP_API model_weights model_init(backend_device const&, size_t n_tensors);

struct model_load_params {
ggml_type float_type = GGML_TYPE_COUNT; // default: use type stored in GGUF file
int n_extra_tensors = 0; // number of extra tensors to allocate in the context
};

// Loads model weights from a GGUF file and transfers them to backend buffers.
VISP_API model_weights model_load(char const* filepath, backend_device const&, model_load_params = {});
VISP_API model_weights model_init(size_t n_tensors);

// Allocates backend buffers for the model weights if needed. Does not transfer data.
// Returns false and does nothing if all tensors already have an associated backend buffer.
VISP_API bool model_allocate(model_weights&, backend_device const&);

// Adds model weights contained in `file` to `weights`. Allocates backend buffers for the
// weights on `device` and transfers the data to the device buffer.
// Optionally converts float weights to the specified data type during transfer.
VISP_API void model_transfer(
model_file const& file,
model_weights& weights,
backend_device const& device,
ggml_type float_type = GGML_TYPE_COUNT,
tensor_data_layout = tensor_data_layout::unknown);

//
// Compute graph - wrapper for ggml_cgraph and its associated backend memory

@@ -107,18 +156,6 @@ VISP_API void compute(compute_graph const&, backend_device const&);
// to support nested modules
// * pass anywhere ggml_context* is expected while building the graph

enum class model_build_flag {
// clang-format off
cwhn = 1 << 0,
conv_2d_direct = 1 << 1,
fused_batch_norm = 1 << 2,
concat_n = 1 << 3,
f16_conv_transpose = 1 << 4,
window_partition = 1 << 5
}; // clang-format on

using model_build_flags = flags<model_build_flag>;

struct VISP_API model_ref {
ggml_context* weights_context = nullptr;
ggml_context* graph_context = nullptr;
@@ -127,8 +164,8 @@ struct VISP_API model_ref {
tensor_name prefix;

model_ref() = default;
model_ref(model_weights& m);
model_ref(model_weights& m, compute_graph& g);
model_ref(model_weights&);
model_ref(model_weights&, compute_graph&);

explicit model_ref(
ggml_context* weights_context,
@@ -247,7 +284,7 @@ struct swin_params {

extern swin_params const swin_t_params;
extern swin_params const swin_l_params;
VISP_API swin_params swin_detect_params(model_ref);
VISP_API swin_params swin_detect_params(model_file const&);

//
// implementation
@@ -256,4 +293,8 @@ constexpr model_build_flags operator|(model_build_flag lhs, model_build_flag rhs
return model_build_flags(uint32_t(lhs) | uint32_t(rhs));
}

constexpr model_build_flags operator~(model_build_flag f) {
return ~model_build_flags(f);
}

} // namespace visp
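
Putting the reshaped loading API together, a minimal sketch of the new flow; the file name is a placeholder, the calls are the ones declared above:

```c++
#include <visp/ml.h>
using namespace visp;

// Split flow: read the GGUF once, then transfer (and optionally convert)
// the weights into backend buffers in a single step.
model_weights load_for_device(char const* gguf_path, backend_type type) {
    backend_device dev = backend_init(type);
    model_file file = model_load(gguf_path);         // GGUF contents in memory
    model_weights weights = model_init(file.n_tensors());
    model_transfer(file, weights, dev,
                   dev.preferred_float_type(),       // e.g. f16 on GPU
                   dev.preferred_layout());          // e.g. cwhn on CPU (see esrgan commit)
    return weights;
}
```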