Merged
24 commits
766b6db
ml: bicubic interpolation tests
Acly Oct 1, 2025
74a5d9a
dino: implement dino-v2 intermediate layers
Acly Oct 2, 2025
eec1f4e
dino: implement depth-anything v2 head
Acly Oct 7, 2025
ed991d8
tests: fix passing multiple inputs to workbench
Acly Oct 7, 2025
7178f55
depth-anything: model conversion, api and cli
Acly Oct 7, 2025
a30a353
ml: add helper to save tensor data to file (mostly for debug)
Acly Oct 8, 2025
7382f25
dino: fix bad _inplace causing output to be overwritten sometimes
Acly Oct 8, 2025
597feed
dino: move implementation to .cpp file
Acly Oct 8, 2025
58c18d7
depth-anything: move implementation to .cpp file
Acly Oct 8, 2025
a90679f
depth-anything: min/max-normalize output depth to [0, 1]
Acly Oct 9, 2025
085e483
depth-anything: support CWHN layout and add to benchmark
Acly Oct 9, 2025
d5960ae
depth-anything: add model test
Acly Oct 9, 2025
4aea83d
depth-anything: use flash attention
Acly Oct 10, 2025
46cc0d1
nn: remove the cwhn im2col path for gpu (not worth maintaining)
Acly Oct 10, 2025
ffc0ea3
depth-anything: support vulkan with flash attention
Acly Oct 10, 2025
b1a48e9
ggml: tests for bicubic interpolation on gpu and fix issue align corners
Acly Oct 10, 2025
5e173b9
depth-anything: change weight names to follow transformers and suppor…
Acly Oct 13, 2025
4be7932
depth-anything: remove tests using old arch
Acly Oct 13, 2025
466c561
ml: add public interface for swin & dino backbones in vision.h
Acly Oct 14, 2025
01cfeb0
birefnet: support flash attention in swin-v1
Acly Oct 15, 2025
195f2f0
depth-anything: documentation, readme, fix license for base/large models
Acly Oct 16, 2025
254f7d5
depth-anything: update image and model download url
Acly Oct 16, 2025
57b1d0b
ml: only use flash attention on gpu by default
Acly Oct 16, 2025
d381eaf
depth-anything: fix benchmark and add numbers, bump version
Acly Oct 16, 2025
4 changes: 2 additions & 2 deletions CMakeLists.txt
@@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.28)

project(vision.cpp VERSION 0.1.0 LANGUAGES CXX)
project(vision.cpp VERSION 0.2.0 LANGUAGES CXX)

option(VISP_VULKAN "Enable Vulkan support" OFF)
option(VISP_DEV "Enable development mode" OFF)
@@ -30,7 +30,7 @@ elseif(CMAKE_BUILD_TYPE)
endif()
endif()

# Configure address sanitizer (Clang only)
# Configure address sanitizer

if(VISP_ASAN)
if(MSVC)
40 changes: 30 additions & 10 deletions README.md
@@ -12,14 +12,17 @@ Based on [ggml](https://github.com/ggml-org/ggml) similar to the [llama.cpp](htt

### Features

| Model | Task | Backends |
| :-------------------------- | :--------------- | :---------- |
| [**MobileSAM**](#mobilesam) | Segmentation | CPU, Vulkan |
| [**BiRefNet**](#birefnet) | Segmentation | CPU, Vulkan |
| [**MI-GAN**](#mi-gan) | Inpainting | CPU, Vulkan |
| [**ESRGAN**](#real-esrgan) | Super-resolution | CPU, Vulkan |
| Model | Task | Backends |
| :--------------------------------------- | :----------------------- | :---------- |
| [**MobileSAM**](#mobilesam) | Promptable segmentation | CPU, Vulkan |
| [**BiRefNet**](#birefnet) | Dichotomous segmentation | CPU, Vulkan |
| [**Depth-Anything**](#depth-anything-v2) | Depth estimation | CPU, Vulkan |
| [**MI-GAN**](#mi-gan) | Inpainting | CPU, Vulkan |
| [**ESRGAN**](#real-esrgan) | Super-resolution | CPU, Vulkan |
| [_Implement a model [**Guide**]_](docs/model-implementation-guide.md) | | |

**Backbones:** SWIN (v1), DINO (v2), TinyViT

## Get Started

Get the library and executables:
@@ -92,6 +95,16 @@ vision-cli sam -m MobileSAM-F16.gguf -i input.png -p 300 200 -o mask.png --compo
vision-cli birefnet -m BiRefNet-lite-F16.gguf -i input.png -o mask.png --composite comp.png
```

#### Depth-Anything V2

<img width="400" height="256" alt="example-depth-anything" src="https://github.com/user-attachments/assets/62bde481-b898-4c46-a298-644198716953" />

[Model download](https://huggingface.co/Acly/Depth-Anything-V2-GGUF/tree/main) | [Paper (arXiv)](https://arxiv.org/abs/2406.09414) | [Repository (GitHub)](https://github.com/DepthAnything/Depth-Anything-V2) | License: Apache-2 / CC-BY-NC-4

```sh
vision-cli depth-anything -m Depth-Anything-V2-Small-F16.gguf -i input.png -o depth.png
```
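The same task through the C++ API added in this PR (`depthany_load_model` / `depthany_compute` from `include/visp/vision.h`); a minimal sketch, assuming a `backend_device` and a decoded input image are obtained through the library's existing helpers, which are not shown here:

```cpp
#include <visp/vision.h>

// Sketch: estimate depth for an already-decoded RGB image.
visp::image_data estimate_depth(visp::backend_device const& device, visp::image_view photo) {
    visp::depthany_model model =
        visp::depthany_load_model("Depth-Anything-V2-Small-F16.gguf", device);
    // Single-channel float32 output, min/max-normalized to [0, 1].
    return visp::depthany_compute(model, photo);
}
```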

#### MI-GAN

<img width="400" height="256" alt="example-migan" src="https://github.com/user-attachments/assets/cadf1994-7677-4822-94e5-a2ee6c07621f" />
@@ -191,10 +204,17 @@ as other frameworks for inference speed, but with:

| Model | | | _vision.cpp_ | PyTorch | ONNX Runtime |
| :---- | :--- | :--- | -----------: | -------: | -----------: |
| Full | cpu | f32 | 16333 ms | 18800 ms | |
| Full | gpu | f16 | 243 ms | 140 ms | |
| Full | cpu | f32 | 16333 ms | 18290 ms | |
| Full | gpu | f16 | 208 ms | 190 ms | |
| Lite | cpu | f32 | 4505 ms | 10900 ms | 6978 ms |
| Lite | gpu | f16 | 86 ms | 59 ms | |
| Lite | gpu | f16 | 85 ms | 84 ms | |

#### Depth-Anything, 518x714

| Model | | | _vision.cpp_ | PyTorch |
| :---- | :--- | :--- | -----------: | ------: |
| Small | gpu | f16 | 11 ms | 10 ms |
| Base | gpu | f16 | 24 ms | 22 ms |

#### MI-GAN, 512x512

@@ -205,7 +225,7 @@

#### Setup

* vision.cpp: using vision-bench, GPU via Vulkan, eg. `vision-bench -m sam -b cpu`
* vision.cpp: using vision-bench, GPU via Vulkan, eg. `vision-bench -m sam`
* PyTorch: v2.7.1+cu128, eager eval, GPU via CUDA, average n iterations after warm-up

## Dependencies (integrated)
2 changes: 1 addition & 1 deletion depend/ggml
Submodule ggml updated 414 files
6 changes: 6 additions & 0 deletions include/visp/image.h
@@ -169,6 +169,12 @@ VISP_API void image_alpha_composite(
VISP_API image_data image_alpha_composite(
image_view const& fg, image_view const& bg, image_view const& mask);

// Rescale pixel values such that the minimum value over all pixels becomes `min` and
// the maximum becomes `max`. Channels are processed independently.
VISP_API void image_normalize(
image_view const& src, image_span const& dst, float min = 0, float max = 1);
VISP_API image_data image_normalize(image_view const& img, float min = 0, float max = 1);

// Compute root-mean-square difference between two images
VISP_API float image_difference_rms(image_view const& a, image_view const& b);
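
A hedged usage sketch for the new `image_normalize` helper (the overload returning `image_data` allocates the destination; conversion from `image_data` to `image_view` is assumed to exist elsewhere in `image.h`):

```cpp
// Sketch: stretch an image so its darkest pixel maps to 0 and its brightest to 1,
// per channel, as documented above.
visp::image_data stretch_contrast(visp::image_view const& img) {
    return visp::image_normalize(img, 0.0f, 1.0f);
}
```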

29 changes: 6 additions & 23 deletions include/visp/ml.h
@@ -65,7 +65,8 @@ enum class model_build_flag {
conv_2d_direct_cwhn = 1 << 1,
concat_n = 1 << 2,
f16_conv_transpose = 1 << 3,
window_partition = 1 << 4
window_partition = 1 << 4,
flash_attention = 1 << 5
}; // clang-format on

using model_build_flags = flags<model_build_flag>;
@@ -87,6 +88,7 @@ struct model_file {
VISP_API int64_t key(char const* name) const;
VISP_API int get_int(char const* name) const;
VISP_API std::string_view get_string(char const* name) const;
VISP_API void get_array(char const* name, span<int> out_values) const;
};
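
A hedged sketch of the new `get_array` accessor next to the existing ones; the metadata key names below are made up for illustration and do not come from this PR, and the library's `span` type is assumed to convert from `std::array` the way `std::span` does:

```cpp
#include <array>

// Sketch: read model hyperparameters from GGUF metadata (key names are hypothetical).
void read_metadata(visp::model_file const& file) {
    int embed_dim = file.get_int("vision.embed_dim"); // hypothetical key
    std::array<int, 4> layers{};
    file.get_array("vision.feature_layers", layers);  // fills the caller's buffer
}
```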

// Opens a .gguf file and reads its contents into memory.
@@ -216,15 +218,18 @@ struct VISP_API tensor_data {

span<float> as_f32();
span<int32_t> as_i32();
span<byte> as_bytes();
span<float const> as_f32() const;
span<int32_t const> as_i32() const;
span<byte const> as_bytes() const;
};

// Allocates data for a tensor in main memory, outside of context and backend buffers.
VISP_API tensor_data tensor_alloc(tensor x);

// Loads tensor data from a file storing raw numbers as binary.
VISP_API tensor_data tensor_load(tensor x, char const* filepath);
VISP_API void tensor_save(tensor x, char const* filepath);

// Copies data to the tensor's backend buffer (which should already be allocated).
VISP_API void transfer_to_backend(tensor_data const&);
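
A hedged sketch of the debugging round-trip enabled by the new `tensor_save`, paired with the existing `tensor_load` and `transfer_to_backend`; the file path and tensor are illustrative:

```cpp
// Sketch: dump a graph tensor's data to disk, then restore it into the backend later.
void debug_roundtrip(visp::tensor x) {
    visp::tensor_save(x, "features.bin");                 // raw binary dump
    visp::tensor_data restored = visp::tensor_load(x, "features.bin");
    visp::transfer_to_backend(restored); // x's backend buffer must already be allocated
}
```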
@@ -274,28 +279,6 @@ VISP_API tensor concat(model_ref const&, std::array<tensor, GGML_MAX_SRC> src, i
// Up- or downsample a 2D tensor (WHCN) to target width x height.
VISP_API tensor interpolate(model_ref const&, tensor x, i64x2 target, int32_t mode);

//
// SWIN Transformer

struct swin_layer_t {
int depth;
int n_heads;
int n_features;
bool downsample;
};

struct swin_params {
static constexpr int n_layers = 4;

int embed_dim;
int window_size;
std::array<swin_layer_t, n_layers> layers;
};

extern swin_params const swin_t_params;
extern swin_params const swin_l_params;
VISP_API swin_params swin_detect_params(model_file const&);

//
// implementation

92 changes: 88 additions & 4 deletions include/visp/vision.h
@@ -57,8 +57,9 @@
// 7. Run the compute graph.
// 8. Transfer the output to the host and post-process it.
//
// Custom pipelines are simply functions which call the individual steps and extend them
// where needed. The implementation of the high-level API functions is a good starting point.
// Custom pipelines can be created simply by writing a function that calls the
// individual steps. As a starting point, check out or copy the implementation
// of the high-level API functions. Then adapt them as needed.
// This allows to:
// * load model weights from a different source
// * control exactly when allocation happens
@@ -76,9 +77,46 @@

#include <array>
#include <span>
#include <vector>

namespace visp {

// SWIN v1 - vision transformer for feature extraction

constexpr int swin_n_layers = 4;

struct swin_layer_t {
int depth;
int n_heads;
int n_features;
};

struct swin_params {
int embed_dim;
int window_size;
std::array<swin_layer_t, swin_n_layers> layers;
};

using swin_buffers = std::array<tensor_data, swin_n_layers + 2>;
using swin_result = std::array<tensor, swin_n_layers>;

VISP_API swin_params swin_detect_params(model_file const&);
VISP_API swin_buffers swin_precompute(model_ref, i32x2 image_extent, swin_params const&);
VISP_API swin_result swin_encode(model_ref, tensor image, swin_params const&);
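
A hedged sketch of calling the SWIN backbone from a custom graph-building function; how the precomputed buffers are uploaded is not shown in this diff, so that step is only indicated in a comment:

```cpp
// Sketch: run the SWIN encoder and get one feature map per stage.
visp::swin_result encode_swin(visp::model_ref m, visp::tensor image,
                              visp::swin_params const& params) {
    // Typically preceded by swin_precompute(m, image_extent, params) and a transfer
    // of the resulting buffers to the backend (exact wiring not shown in this diff).
    return visp::swin_encode(m, image, params);
}
```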

// DINO v2 - vision transformer for feature extraction

struct dino_params {
int patch_size = 16;
int embed_dim = 768;
int n_layers = 12;
int n_heads = 12;
};

VISP_API dino_params dino_detect_params(model_file const&);
VISP_API std::vector<tensor> dino_get_intermediate_layers(
model_ref, tensor image, span<int const> layers_ids, dino_params const&);
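
And a hedged sketch for the DINO-v2 counterpart; the layer indices match the `depthany_params` defaults further down, and the library's `span` is assumed to convert from a plain array as `std::span` does:

```cpp
// Sketch: fetch patch features from selected DINO-v2 transformer blocks.
std::vector<visp::tensor> encode_dino(visp::model_ref m, visp::tensor image,
                                      visp::dino_params const& params) {
    int const layer_ids[] = {2, 5, 8, 11}; // e.g. the layers Depth Anything Small taps
    return visp::dino_get_intermediate_layers(m, image, layer_ids, params);
}
```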

//
// Mobile SAM - image segmentation with prompt (point or box)

@@ -133,7 +171,9 @@ VISP_API image_data sam_process_mask(
struct birefnet_model;

// Loads a BiRefNet model from GGUF file onto the backend device.
// * supports BiRefNet, BiRefNet_lite, BiRefNet_Matting variants at 1024px resolution
// * supports BiRefNet, BiRefNet-lite, BiRefNet-Matting variants at 1024px resolution
// * supports BiRefNet-HR variant at 2048px resolution
// * supports BiRefNet-dynamic variant at arbitrary resolution
VISP_API birefnet_model birefnet_load_model(char const* filepath, backend_device const&);

// Takes RGB input and computes an alpha mask with foreground as 1.0 and background as 0.0.
@@ -148,7 +188,7 @@ struct birefnet_params {
swin_params encoder;
};

using birefnet_buffers = std::array<tensor_data, swin_params::n_layers + 2>;
using birefnet_buffers = swin_buffers;

VISP_API birefnet_params birefnet_detect_params(
model_file const&, i32x2 dynamic_extent = {}, size_t max_alloc = SIZE_MAX);
@@ -162,6 +202,39 @@ VISP_API image_data birefnet_process_output(

VISP_API tensor birefnet_predict(model_ref, tensor image, birefnet_params const&);

//
// Depth Anything - depth estimation

struct depthany_model;

// Loads a Depth Anything V2 model from GGUF file onto the backend device.
// * supports Small/Base/Large variants with flexible input resolution
VISP_API depthany_model depthany_load_model(char const* filepath, backend_device const&);

// Takes RGB input and computes estimated depth (distance from camera).
// Output is a single-channel float32 image in range [0, 1.0].
VISP_API image_data depthany_compute(depthany_model&, image_view image);

// --- Depth Anything pipeline

struct depthany_params {
int image_size = 518;
int image_multiple = 14;
i32x2 image_extent = {518, 518};
float max_depth = 1;
std::array<int, 4> feature_layers = {2, 5, 8, 11};
dino_params dino;
};

VISP_API depthany_params depthany_detect_params(model_file const&, i32x2 input_extent = {});
VISP_API i32x2 depthany_image_extent(i32x2 input_extent, depthany_params const&);

VISP_API image_data depthany_process_input(image_view image, depthany_params const&);
image_data depthany_process_output(
std::span<float const> output_data, i32x2 target_extent, depthany_params const&);

VISP_API tensor depthany_predict(model_ref, tensor image, depthany_params const&);
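
For orientation, a hedged sketch of how the pipeline functions above chain together; building and running the compute graph is elided since those helpers are outside this diff:

```cpp
// Sketch: the Depth Anything pipeline stages, with graph execution elided.
visp::image_data depth_pipeline(visp::model_file const& file, visp::image_view photo,
                                visp::i32x2 input_extent, std::span<float const> raw_output) {
    visp::depthany_params params = visp::depthany_detect_params(file, input_extent);
    visp::image_data model_input = visp::depthany_process_input(photo, params);
    // ... upload `model_input`, build a graph whose builder calls
    //     depthany_predict(m, image_tensor, params), run it, and download the
    //     float32 result into `raw_output` (steps not shown in this diff) ...
    return visp::depthany_process_output(raw_output, input_extent, params);
}
```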

//
// MI-GAN - image inpainting

@@ -246,6 +319,17 @@ struct birefnet_model {
tensor output = nullptr;
};

// internal
struct depthany_model {
backend_device const* backend = nullptr;
model_weights weights;
depthany_params params;

compute_graph graph;
tensor input = nullptr;
tensor output = nullptr;
};

// internal
struct migan_model {
backend_device const* backend = nullptr;
7 changes: 7 additions & 0 deletions models/CMakeLists.txt
@@ -14,6 +14,13 @@ file(DOWNLOAD
EXPECTED_HASH "SHA256=7b5397a2c98d66677f8f74317774bbeac49dbb321b8a3dc744af913db71d4fa5"
SHOW_PROGRESS
)
message(STATUS "Checking for models/Depth-Anything-V2-Small-F16.gguf")
file(DOWNLOAD
"https://huggingface.co/Acly/Depth-Anything-V2-GGUF/resolve/main/Depth-Anything-V2-Small-F16.gguf"
${CMAKE_CURRENT_LIST_DIR}/Depth-Anything-V2-Small-F16.gguf
EXPECTED_HASH "SHA256=0f83332d6a8b4375cd7fdcc168f3e3636f474f8e84b0959e903f513aace782f5"
SHOW_PROGRESS
)
message(STATUS "Checking for models/MIGAN-512-places2-F16.gguf")
file(DOWNLOAD
"https://huggingface.co/Acly/MIGAN-GGUF/resolve/main/MIGAN-512-places2-F16.gguf"