depth-anything: documentation, readme, fix license for base/large models

Acly · Acly · commit 195f2f08d6f2 · 2025-10-16T11:41:09.000+02:00
diff --git a/README.md b/README.md
@@ -12,14 +12,17 @@ Based on [ggml](https://github.com/ggml-org/ggml) similar to the [llama.cpp](htt
 
 ### Features
 
-| Model                       | Task             | Backends    |
-| :-------------------------- | :--------------- | :---------- |
-| [**MobileSAM**](#mobilesam) | Segmentation     | CPU, Vulkan |
-| [**BiRefNet**](#birefnet)   | Segmentation     | CPU, Vulkan |
-| [**MI-GAN**](#mi-gan)       | Inpainting       | CPU, Vulkan |
-| [**ESRGAN**](#real-esrgan)  | Super-resolution | CPU, Vulkan |
+| Model                                    | Task                     | Backends    |
+| :--------------------------------------- | :----------------------- | :---------- |
+| [**MobileSAM**](#mobilesam)              | Promptable segmentation  | CPU, Vulkan |
+| [**BiRefNet**](#birefnet)                | Dichotomous segmentation | CPU, Vulkan |
+| [**Depth-Anything**](#depth-anything-v2) | Depth estimation         | CPU, Vulkan |
+| [**MI-GAN**](#mi-gan)                    | Inpainting               | CPU, Vulkan |
+| [**ESRGAN**](#real-esrgan)               | Super-resolution         | CPU, Vulkan |
 | [_Implement a model [**Guide**]_](docs/model-implementation-guide.md) | | |
 
+**Backbones:** SWIN (v1), DINO (v2), TinyViT
+
 ## Get Started
 
 Get the library and executables:
@@ -92,6 +95,16 @@ vision-cli sam -m MobileSAM-F16.gguf -i input.png -p 300 200 -o mask.png --compo
 vision-cli birefnet -m BiRefNet-lite-F16.gguf -i input.png -o mask.png --composite comp.png
 ```
 
+#### Depth-Anything V2
+
+<img width="400" height="256" alt="example-depth-anything" src="" />
+
+[Model download](https://huggingface.co/Acly/Depth-Anything-V2-GGUF/tree/main) | [Paper (arXiv)](https://arxiv.org/abs/2406.09414) | [Repository (GitHub)](https://github.com/DepthAnything/Depth-Anything-V2) | License: Apache-2 (Small) / CC-BY-NC-4 (Base, Large)
+
+```sh
+vision-cli depth-anything -m Depth-Anything-V2-Small-F16.gguf -i input.png -o depth.png
+```
+
 #### MI-GAN
 
 <img width="400" height="256" alt="example-migan" src="https://github.com/user-attachments/assets/cadf1994-7677-4822-94e5-a2ee6c07621f" />
diff --git a/include/visp/vision.h b/include/visp/vision.h
@@ -57,8 +57,9 @@
 // 7. Run the compute graph.
 // 8. Transfer the output to the host and post-process it.
 //
-// Custom pipelines are simply functions which call the individual steps and extend them
-// where needed. The implementation of the high-level API functions is a good starting point.
+// Custom pipelines can be created simply by writing a function that calls the
+// individual steps. As a starting point, check out or copy the implementation
+// of the high-level API functions. Then adapt them as needed.
 // This allows to:
 // * load model weights from a different source
 // * control exactly when allocation happens
@@ -76,10 +77,11 @@
 
 #include <array>
 #include <span>
+#include <vector>
 
 namespace visp {
 
-// SWIN - vision transformer for feature extraction
+// SWIN v1 - vision transformer for feature extraction
 
 constexpr int swin_n_layers = 4;
 
@@ -102,7 +104,7 @@ VISP_API swin_params swin_detect_params(model_file const&);
 VISP_API swin_buffers swin_precompute(model_ref, i32x2 image_extent, swin_params const&);
 VISP_API swin_result swin_encode(model_ref, tensor image, swin_params const&);
 
-// DINO - vision transformer for feature extraction
+// DINO v2 - vision transformer for feature extraction
 
 struct dino_params {
     int patch_size = 16;
@@ -169,7 +171,9 @@ VISP_API image_data sam_process_mask(
 struct birefnet_model;
 
 // Loads a BiRefNet model from GGUF file onto the backend device.
-// * supports BiRefNet, BiRefNet_lite, BiRefNet_Matting variants at 1024px resolution
+// * supports BiRefNet, BiRefNet-lite, BiRefNet-Matting variants at 1024px resolution
+// * supports BiRefNet-HR variant at 2048px resolution
+// * supports BiRefNet-dynamic variant at arbitrary resolution
 VISP_API birefnet_model birefnet_load_model(char const* filepath, backend_device const&);
 
 // Takes RGB input and computes an alpha mask with foreground as 1.0 and background as 0.0.
@@ -203,7 +207,12 @@ VISP_API tensor birefnet_predict(model_ref, tensor image, birefnet_params const&
 
 struct depthany_model;
 
+// Loads a Depth Anything V2 model from GGUF file onto the backend device.
+// * supports Small/Base/Large variants with flexible input resolution
 VISP_API depthany_model depthany_load_model(char const* filepath, backend_device const&);
+
+// Takes RGB input and computes estimated depth (distance from camera).
+// Output is a single-channel float32 image in range [0, 1.0].
 VISP_API image_data depthany_compute(depthany_model&, image_view image);
 
 // --- Depth Anything pipeline
@@ -222,7 +231,7 @@ VISP_API i32x2 depthany_image_extent(i32x2 input_extent, depthany_params const&)
 
 VISP_API image_data depthany_process_input(image_view image, depthany_params const&);
 image_data depthany_process_output(
-    span<float const> output_data, i32x2 target_extent, depthany_params const&);
+    std::span<float const> output_data, i32x2 target_extent, depthany_params const&);
 
 VISP_API tensor depthany_predict(model_ref, tensor image, depthany_params const&);
 
diff --git a/models/CMakeLists.txt b/models/CMakeLists.txt
@@ -14,6 +14,13 @@ file(DOWNLOAD
   EXPECTED_HASH "SHA256=7b5397a2c98d66677f8f74317774bbeac49dbb321b8a3dc744af913db71d4fa5"
   SHOW_PROGRESS
 )
+message(STATUS "Checking for models/Depth-Anything-V2-Small-F16.gguf")
+file(DOWNLOAD
+  "https://huggingface.co/Acly/Depth-Anything-V2-GGUF/resolve/main/Depth-Anything-V2-Small-F16.gguf"
+  ${CMAKE_CURRENT_LIST_DIR}/Depth-Anything-V2-Small-F16.gguf
+  EXPECTED_HASH "SHA256=0f83332d6a8b4375cd7fdcc168f3e3636f474f8e84b0959e903f513aace782f5"
+  SHOW_PROGRESS
+)
 message(STATUS "Checking for models/MIGAN-512-places2-F16.gguf")
 file(DOWNLOAD
   "https://huggingface.co/Acly/MIGAN-GGUF/resolve/main/MIGAN-512-places2-F16.gguf"
diff --git a/scripts/convert.py b/scripts/convert.py
@@ -354,7 +354,10 @@ def convert_birefnet(input_filepath: Path, writer: Writer):
 
 
 def convert_depth_anything(input_filepath: Path, writer: Writer):
-    writer.add_license("apache-2.0")
+    if "small" in input_filepath.name.lower():
+        writer.add_license("apache-2.0")
+    else:
+        writer.add_license("cc-by-nc-4.0")
     writer.set_tensor_layout_default(TensorLayout.nchw)
 
     model: dict[str, Tensor] = load_model(input_filepath)