diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 631420d..c8573aa 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -37,6 +37,13 @@ jobs: if: matrix.os == 'windows-latest' uses: microsoft/setup-msbuild@v2 + - name: Vulkan SDK (Windows) + if: matrix.os == 'windows-latest' + uses: humbletim/install-vulkan-sdk@v1.2 + with: + version: 1.4.309.0 + cache: true + - name: Configure (Linux) if: matrix.os == 'ubuntu-22.04' run: > @@ -52,6 +59,7 @@ jobs: cmake . -B build -A x64 -D CMAKE_BUILD_TYPE=Release -D VISP_CI=ON + -D VISP_VULKAN=ON - name: Configure (MacOS) if: matrix.os == 'macos-14' @@ -74,8 +82,7 @@ jobs: # export GGML_VK_VISIBLE_DEVICES=0 # ctest --verbose - - name: Test CPU - if: matrix.os != 'ubuntu-22.04' + - name: Test working-directory: ./build run: ctest --verbose -C Release diff --git a/CMakeLists.txt b/CMakeLists.txt index 8e462d1..3e5493f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,6 +16,8 @@ if(PROJECT_IS_TOP_LEVEL) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) endif() +# Configure assertions + if(VISP_DEV) set(VISP_ASSERT "VISP_ASSERT_BREAK") elseif(VISP_CI) @@ -28,6 +30,8 @@ elseif(CMAKE_BUILD_TYPE) endif() endif() +# Configure address sanitizer + if(VISP_ASAN) if(MSVC) add_compile_options(/fsanitize=address) @@ -38,12 +42,20 @@ if(VISP_ASAN) endif() endif() -if(MSVC) - add_compile_options(/Zi /utf-8) - add_compile_definitions(_CRT_SECURE_NO_WARNINGS) - add_link_options(/DEBUG) # Enable debug symbols also in release builds +# Windows/MSVC specific defaults + +if(MSVC) + list(APPEND VISP_COMP_OPTIONS /utf-8) + list(APPEND VISP_DEFINITIONS _CRT_SECURE_NO_WARNINGS) + if(PROJECT_IS_TOP_LEVEL) + # Enable debug symbols also in release builds + list(APPEND VISP_COMP_OPTIONS /Zi) + list(APPEND VISP_LINK_OPTIONS /DEBUG) + endif() endif() +# Configure warnings + if(VISP_DEV OR VISP_CI) if(MSVC) set(VISP_WARNINGS /W4 /WX /wd4251) @@ -59,7 +71,7 @@ add_subdirectory(depend/stb) if(VISP_FMT_LIB) add_subdirectory(depend/fmt) set(VISP_FMT_LINK fmt::fmt) - set(VISP_FMT_DEFS VISP_FMT_LIB) + list(APPEND VISP_DEFINITIONS VISP_FMT_LIB) endif() set(GGML_VULKAN ${VISP_VULKAN}) diff --git a/README.md b/README.md index db180fa..a89a3df 100644 --- a/README.md +++ b/README.md @@ -48,11 +48,11 @@ Pass `--composite output.png` to composite input and mask.
Use `--help` for more #### API ```c++ -#include <visp/vision.hpp> +#include <visp/vision.h> using namespace visp; int main() { - backend cpu = backend_init(backend_type::cpu); + backend_device cpu = backend_init(backend_type::cpu); sam_model sam = sam_load_model("MobileSAM-F16.gguf", cpu); image_data input_image = image_load("input.jpg"); @@ -180,32 +180,32 @@ as other frameworks for inference speed, but with: * CPU: AMD Ryzen 5 5600X (6 cores) * GPU: NVIDIA GeForce RTX 4070 -#### MobileSAM, 1024x1024, encode + decode +#### MobileSAM, 1024x1024 -| | | _vision.cpp_ | PyTorch | ONNX Runtime | -| :--- | :--- | -----------: | ----------: | -----------: | -| cpu | f32 | 632 + 37 ms | 559 + 42 ms | 728 + 87 ms | -| gpu | f16 | 18 + 3 ms | 10 + 6 ms | | +| | | _vision.cpp_ | PyTorch | ONNX Runtime | +| :--- | :--- | -----------: | ------: | -----------: | +| cpu | f32 | 669 ms | 601 ms | 805 ms | +| gpu | f16 | 19 ms | 16 ms | | #### BiRefNet, 1024x1024 | Model | | | _vision.cpp_ | PyTorch | ONNX Runtime | | :---- | :--- | :--- | -----------: | -------: | -----------: | | Full | cpu | f32 | 16333 ms | 18800 ms | | -| Full | gpu | f16 | 380 ms | 140 ms | | +| Full | gpu | f16 | 243 ms | 140 ms | | | Lite | cpu | f32 | 4505 ms | 10900 ms | 6978 ms | -| Lite | gpu | f16 | 204 ms | 59 ms | 967 ms | +| Lite | gpu | f16 | 86 ms | 59 ms | | #### MI-GAN, 512x512 | Model | | | _vision.cpp_ | PyTorch | | :---------- | :--- | :--- | -----------: | ------: | | 512-places2 | cpu | f32 | 523 ms | 637 ms | -| 512-places2 | gpu | f16 | 24 ms | 17 ms | +| 512-places2 | gpu | f16 | 21 ms | 17 ms | #### Setup -* vision.cpp: using vision-bench, GPU via Vulkan, eg. `vision-bench sam cpu` +* vision.cpp: using vision-bench, GPU via Vulkan, eg. `vision-bench -m sam -b cpu` * PyTorch: v2.7.1+cu128, eager eval, GPU via CUDA, average n iterations after warm-up ## Dependencies (integrated) diff --git a/depend/ggml b/depend/ggml index 41982e7..f77c43a 160000 --- a/depend/ggml +++ b/depend/ggml @@ -1 +1 @@ -Subproject commit 41982e7b5985250c9322bdbdde0ab91bfd4e27f7 +Subproject commit f77c43aadfd9a552bbd2c2c5160e8caf85fe0288 diff --git a/docs/model-implementation-guide.md b/docs/model-implementation-guide.md index 3ea9c17..698f83d 100644 --- a/docs/model-implementation-guide.md +++ b/docs/model-implementation-guide.md @@ -51,7 +51,7 @@ PyTorch code. The great thing about ggml is, you can always follow-reference in your IDE and see almost immediately how things are implemented. It is small enough to be compiled along-side, so you can step into functions, add prints, etc. If some -functionality is missing, you can quickly hack it in. Make sure to use. +functionality is missing, you can quickly hack it in. Make sure to use that. ### vision.cpp @@ -68,7 +68,7 @@ tensor some_module(model_ref m, tensor x, ...) Here `tensor` is short for `ggml_tensor *`, which can be a weight or the result of an operation. The `model_ref` is used to build a compute graph by passing it to ggml functions as replacement for `ggml_context *`. It keeps track of parent -modules and provides a way to access model weights. +modules and provides a way to access model weights by name. `some_module` typically represents the forward function of a PyTorch `nn.Module`. The whole model can be defined with reusable functions. @@ -108,7 +108,8 @@ be converted. It's usually a good opportunity to optimize for inference, throw away training-only stuff, maybe fuse some operations, or convert to a faster memory layout. -If you haven't already, setup a Python environment (just running `uv sync` will do).
+If you haven't already, set up a Python environment (I use +[uv](https://docs.astral.sh/uv/) and simply run `uv sync`). Open `scripts/convert.py` and add a conversion function similar to the existing ones. A 1:1 conversion is very simple: diff --git a/include/visp/image.hpp b/include/visp/image.h similarity index 96% rename from include/visp/image.hpp rename to include/visp/image.h index 234bf94..a2b01fd 100644 --- a/include/visp/image.hpp +++ b/include/visp/image.h @@ -1,6 +1,6 @@ #pragma once -#include "visp/util.hpp" +#include "visp/util.h" #include #include @@ -97,9 +97,12 @@ struct image_data { std::unique_ptr data; }; -// Allocate image data. Pixels are not initialized. +// Allocate image data. Memory is not initialized! VISP_API image_data image_alloc(i32x2 extent, image_format format); +// Set all pixels to zero. +VISP_API void image_clear(image_span const&); + // Load image from file (PNG, JPEG, etc.) VISP_API image_data image_load(char const* filepath); @@ -194,6 +197,7 @@ struct VISP_API tile_layout { VISP_API tile_layout tile_scale(tile_layout const&, int scale); // Merge a tile into the destination image. Both images must be rgb_f32 format. +// Blends pixels from `tile` and `dst` in overlap regions. `dst` must be all zeros initially. VISP_API void tile_merge( image_view const& tile, image_span const& dst, i32x2 tile_coord, tile_layout const& layout); diff --git a/include/visp/ml.hpp b/include/visp/ml.h similarity index 75% rename from include/visp/ml.hpp rename to include/visp/ml.h index b0290d3..2a3826d 100644 --- a/include/visp/ml.hpp +++ b/include/visp/ml.h @@ -1,7 +1,7 @@ #pragma once -#include "visp/image.hpp" -#include "visp/util.hpp" +#include "visp/image.h" +#include "visp/util.h" #include #include @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include namespace visp { @@ -21,30 +23,74 @@ using std::span; using tensor_name = fixed_string; using tensor = ggml_tensor*; +// Memory layout, especially for weights of 2D operations like convolutions +enum tensor_data_layout { unknown, whcn, cwhn }; + // -// Backend +// Backend device - represents the compute hardware enum class backend_type { cpu = 1, gpu = 2 }; // True if the backend library is loaded and has at least one supported device. VISP_API bool backend_is_available(backend_type); -struct VISP_API backend_device { +struct backend_device { ggml_backend_ptr handle; ggml_backend_dev_t device; - backend_type type() const; - ggml_type preferred_float_type() const; - size_t total_memory() const; + VISP_API backend_type type() const; + VISP_API ggml_type preferred_float_type() const; + VISP_API tensor_data_layout preferred_layout() const; + VISP_API size_t total_memory() const; operator ggml_backend_t() const { return handle.get(); } }; +// Initialize a backend device, automatically picking the "best" available. VISP_API backend_device backend_init(); + +// Initialize the most suited device that matches the specified backend type. VISP_API backend_device backend_init(backend_type); +// Set number of threads used by the backend (CPU only).
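+// A hypothetical usage sketch (thread count chosen arbitrarily; both functions are declared in this header): +//   backend_device dev = backend_init(backend_type::cpu); +//   backend_set_n_threads(dev, 8);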
VISP_API void backend_set_n_threads(backend_device&, int n_threads); +// +// Model build flags - backend capabilities, model configuration and optimization + +enum class model_build_flag { + // clang-format off + cwhn = 1 << 0, + conv_2d_direct_cwhn = 1 << 1, + concat_n = 1 << 2, + f16_conv_transpose = 1 << 3, + window_partition = 1 << 4 +}; // clang-format on + +using model_build_flags = flags; + +VISP_API model_build_flags backend_default_flags(backend_type); + +// +// Model file - holds the contents of a GGUF file + +struct model_file { + gguf_context_ptr gguf; + ggml_context_ptr data; + std::string path; + + VISP_API int64_t n_tensors() const; + VISP_API std::string_view arch() const; + VISP_API tensor_data_layout tensor_layout() const; + + VISP_API int64_t key(char const* name) const; + VISP_API int get_int(char const* name) const; + VISP_API std::string_view get_string(char const* name) const; +}; + +// Opens a .gguf file and reads its contents into memory. +VISP_API model_file model_load(char const* filepath); + // // Model weights // @@ -52,33 +98,36 @@ VISP_API void backend_set_n_threads(backend_device&, int n_threads); // * holds the backend buffers for model weight data // * holds buffers for extra tensors such as pre-computed lookup tables -struct VISP_API model_weights { +struct model_weights { ggml_context_ptr context; backend_type buffer_type = backend_type::cpu; ggml_backend_buffer_ptr weights_buffer; std::vector extra_buffers; + model_build_flags flags; - ggml_type float_type() const; + VISP_API ggml_type float_type() const; operator ggml_context*() const { return context.get(); } }; // Creates a GGML context with storage for a fixed number of tensors. // Does not allocate any backend buffers. -VISP_API model_weights model_init(backend_device const&, size_t n_tensors); - -struct model_load_params { - ggml_type float_type = GGML_TYPE_COUNT; // default: use type stored in GGUF file - int n_extra_tensors = 0; // number of extra tensors to allocate in the context -}; - -// Loads model weights from a GGUF file and transfers them to backend buffers. -VISP_API model_weights model_load(char const* filepath, backend_device const&, model_load_params = {}); +VISP_API model_weights model_init(size_t n_tensors); // Allocates backend buffers for the model weights if needed. Does not transfer data. // Returns false and does nothing if all tensors already have an associated backend buffer. VISP_API bool model_allocate(model_weights&, backend_device const&); +// Adds model weights contained in `file` to `weights`. Allocates backend buffers for the +// weights on `device` and transfers the data to the device buffer. +// Optionally converts float weights to the specified data type during transfer. 
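+// Example of the full load path, mirroring load_model_weights() in src/cli/cli.cpp from this diff (model path is just a placeholder): +//   model_file file = model_load("models/MobileSAM-F16.gguf"); +//   model_weights weights = model_init(file.n_tensors()); +//   model_transfer(file, weights, device, device.preferred_float_type(), file.tensor_layout());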
+VISP_API void model_transfer( + model_file const& file, + model_weights& weights, + backend_device const& device, + ggml_type float_type = GGML_TYPE_COUNT, + tensor_data_layout = tensor_data_layout::unknown); + // // Compute graph - wrapper for ggml_cgraph and its associated backend memory @@ -107,18 +156,6 @@ VISP_API void compute(compute_graph const&, backend_device const&); // to support nested modules // * pass anywhere ggml_context* is expected while building the graph -enum class model_build_flag { - // clang-format off - cwhn = 1 << 0, - conv_2d_direct = 1 << 1, - fused_batch_norm = 1 << 2, - concat_n = 1 << 3, - f16_conv_transpose = 1 << 4, - window_partition = 1 << 5 -}; // clang-format on - -using model_build_flags = flags; - struct VISP_API model_ref { ggml_context* weights_context = nullptr; ggml_context* graph_context = nullptr; @@ -127,8 +164,8 @@ struct VISP_API model_ref { tensor_name prefix; model_ref() = default; - model_ref(model_weights& m); - model_ref(model_weights& m, compute_graph& g); + model_ref(model_weights&); + model_ref(model_weights&, compute_graph&); explicit model_ref( ggml_context* weights_context, @@ -247,7 +284,7 @@ struct swin_params { extern swin_params const swin_t_params; extern swin_params const swin_l_params; -VISP_API swin_params swin_detect_params(model_ref); +VISP_API swin_params swin_detect_params(model_file const&); // // implementation @@ -256,4 +293,8 @@ constexpr model_build_flags operator|(model_build_flag lhs, model_build_flag rhs return model_build_flags(uint32_t(lhs) | uint32_t(rhs)); } +constexpr model_build_flags operator~(model_build_flag f) { + return ~model_build_flags(f); +} + } // namespace visp diff --git a/include/visp/util.hpp b/include/visp/util.h similarity index 83% rename from include/visp/util.hpp rename to include/visp/util.h index 99c008f..74e4826 100644 --- a/include/visp/util.hpp +++ b/include/visp/util.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -138,25 +139,23 @@ struct flags { explicit constexpr flags(uint32_t value) : value(value) {} flags& operator|=(E other) { - value |= other; + value |= uint32_t(other); return *this; } - - friend constexpr bool operator&(flags lhs, E rhs) { - return (lhs.value & uint32_t(rhs)) != 0; + + flags& operator|=(flags other) { + value |= other.value; + return *this; } - friend constexpr bool operator&(flags lhs, flags rhs) { - return (lhs.value & rhs.value) != 0; - } + constexpr flags operator~() const { return flags(~value); } + explicit constexpr operator bool() const { return value != 0; } - friend constexpr flags operator|(flags lhs, E rhs) { - return flags(lhs.value | uint32_t(rhs)); - } + friend constexpr flags operator&(flags lhs, E rhs) { return flags(lhs.value & uint32_t(rhs)); } + friend constexpr flags operator&(flags lhs, flags rhs) { return flags(lhs.value & rhs.value); } - friend constexpr flags operator|(flags lhs, flags rhs) { - return flags(lhs.value | rhs.value); - } + friend constexpr flags operator|(flags lhs, E rhs) { return flags(lhs.value | uint32_t(rhs)); } + friend constexpr flags operator|(flags lhs, flags rhs) { return flags(lhs.value | rhs.value); } }; } // namespace visp diff --git a/include/visp/vision.hpp b/include/visp/vision.h similarity index 93% rename from include/visp/vision.hpp rename to include/visp/vision.h index a85b178..8fa4211 100644 --- a/include/visp/vision.hpp +++ b/include/visp/vision.h @@ -8,21 +8,21 @@ // // Vision.cpp comes in 3 main headers: // -// visp/image.hpp +// visp/image.h // // Defines structures to store 
and reference pixel data. Supports loading, saving and // common processing of images. Most tasks take an `image_view` as input, which // is a non-owning reference to external pixel data. Output is returned as // `image_data` (allocated by the library) or written to an `image_span`. // -// visp/ml.hpp +// visp/ml.h // // Contains ML infrastructure shared between all models: loading weights, // transferring data between backend devices (eg. GPU), and executing // compute graphs. Most of these are thin convenience wrappers around GGML. // Alternatively you can use GGML directly for greater flexibility. // -// visp/vision.hpp (this file) +// visp/vision.h (this file) // // Provides a high-level API to run inference on various vision models for // common tasks. These operations are built for simplicity and don't provide @@ -70,9 +70,9 @@ #pragma once -#include "visp/image.hpp" -#include "visp/ml.hpp" -#include "visp/util.hpp" +#include "visp/image.h" +#include "visp/ml.h" +#include "visp/util.h" #include #include @@ -142,14 +142,17 @@ VISP_API image_data birefnet_compute(birefnet_model&, image_view image); // --- BiRefNet pipeline struct birefnet_params { - int image_size = 1024; + int image_size = 1024; // can be -1 for dynamic size + int image_multiple = 32; + i32x2 image_extent = {1024, 1024}; // required if image_size is -1 swin_params encoder; }; using birefnet_buffers = std::array; -VISP_API birefnet_params birefnet_detect_params(model_ref); +VISP_API birefnet_params birefnet_detect_params(model_file const&, i32x2 dynamic_extent = {}); VISP_API birefnet_buffers birefnet_precompute(model_ref, birefnet_params const&); +VISP_API i32x2 birefnet_image_extent(i32x2 input_extent, birefnet_params const&); VISP_API image_data birefnet_process_input(image_view, birefnet_params const&); VISP_API image_data birefnet_process_output( @@ -176,7 +179,7 @@ struct migan_params { bool invert_mask = false; }; -VISP_API migan_params migan_detect_params(model_ref m); +VISP_API migan_params migan_detect_params(model_file const&); VISP_API image_data migan_process_input(image_view image, image_view mask, migan_params const&); VISP_API image_data migan_process_output( @@ -204,7 +207,7 @@ struct esrgan_params { int n_blocks = 23; }; -VISP_API esrgan_params esrgan_detect_params(model_ref); +VISP_API esrgan_params esrgan_detect_params(model_file const&); VISP_API int esrgan_estimate_graph_size(esrgan_params const&); VISP_API tensor esrgan_generate(model_ref, tensor image, esrgan_params const&); diff --git a/models/CMakeLists.txt b/models/CMakeLists.txt index 7e72a01..d1afb96 100644 --- a/models/CMakeLists.txt +++ b/models/CMakeLists.txt @@ -4,27 +4,27 @@ message(STATUS "Checking for models/MobileSAM-F16.gguf") file(DOWNLOAD "https://huggingface.co/Acly/MobileSAM-GGUF/resolve/main/MobileSAM-F16.gguf" ${CMAKE_CURRENT_LIST_DIR}/MobileSAM-F16.gguf - EXPECTED_HASH "SHA256=1e392f58a0e518b7e1e9e5a43403ff0c6d001aeefa6f4e4d2bdf60f7bbe6e4f2" + EXPECTED_HASH "SHA256=b546366475e3ad744bb2eaf7634df88e9aaf25f6622797d2de300f5a530831f7" SHOW_PROGRESS ) message(STATUS "Checking for models/BiRefNet-lite-F16.gguf") file(DOWNLOAD "https://huggingface.co/Acly/BiRefNet-GGUF/resolve/main/BiRefNet-lite-F16.gguf" ${CMAKE_CURRENT_LIST_DIR}/BiRefNet-lite-F16.gguf - EXPECTED_HASH "SHA256=f038843ea7c44a859491df96c7b36815143f7de77b13cbfc0dae5f6eae863fb5" + EXPECTED_HASH "SHA256=7b5397a2c98d66677f8f74317774bbeac49dbb321b8a3dc744af913db71d4fa5" SHOW_PROGRESS ) message(STATUS "Checking for models/MIGAN-512-places2-F16.gguf") file(DOWNLOAD 
"https://huggingface.co/Acly/MIGAN-GGUF/resolve/main/MIGAN-512-places2-F16.gguf" ${CMAKE_CURRENT_LIST_DIR}/MIGAN-512-places2-F16.gguf - EXPECTED_HASH "SHA256=c9f241e96fb5a791f9494fc7d4c2dd793297ae95f05b8423f547d19bea465b81" + EXPECTED_HASH "SHA256=3e47592bf716d0dc306f8dc02d4476cfcdaf2c055fa3c3c8e0ced4db775eb64b" SHOW_PROGRESS ) message(STATUS "Checking for models/RealESRGAN-x4plus_anime-6B-F16.gguf") file(DOWNLOAD "https://huggingface.co/Acly/Real-ESRGAN-GGUF/resolve/main/RealESRGAN-x4plus_anime-6B-F16.gguf" ${CMAKE_CURRENT_LIST_DIR}/RealESRGAN-x4plus_anime-6B-F16.gguf - EXPECTED_HASH "SHA256=b741e68720d7ad6251dee2120bf7579ef816ea16da18299b39f6cbcb0e13ecf0" + EXPECTED_HASH "SHA256=730469c5a2269cdef96d0d58aacf87bcf25d7a0d92256685808e6cdce0675c09" SHOW_PROGRESS ) \ No newline at end of file diff --git a/scripts/convert.py b/scripts/convert.py index 9bc6a7c..054bf42 100644 --- a/scripts/convert.py +++ b/scripts/convert.py @@ -19,6 +19,7 @@ import safetensors import numpy as np +from enum import Enum from pathlib import Path from gguf import GGUFWriter, Metadata, GGML_QUANT_VERSION from torch import Tensor @@ -27,11 +28,29 @@ # Common +class TensorLayout(Enum): + unknown = "unknown" + nchw = "whcn" + nhwc = "cwhn" + + @staticmethod + def parse(s: str): + if s == "whcn" or s == "nchw": + return TensorLayout.nchw + if s == "cwhn" or s == "nhwc": + return TensorLayout.nhwc + return TensorLayout.unknown + + class Writer(GGUFWriter): def __init__(self, path: Path, arch_name: str, float_type: str, verbose: bool): super().__init__(path, arch_name) + self.arch = arch_name self.float_type = float_type + self.tensor_layout = TensorLayout.unknown self.verbose = verbose + self.conv2d_weights: list[int] = [] + self._index = 0 def add_tensor(self, name: str, tensor: Tensor, float_type: str | None = None): if len(name) >= 64: @@ -45,6 +64,33 @@ def add_tensor(self, name: str, tensor: Tensor, float_type: str | None = None): if self.verbose: print(name, tensor.shape, tensor_data.dtype) super().add_tensor(name, tensor_data) + self._index += 1 + + def convert_tensor_2d(self, tensor: Tensor): + # assume tensor is NCHW layout (PyTorch default) + if self.tensor_layout is TensorLayout.nhwc: + return conv_2d_to_nhwc(tensor) + else: + # add tensor index to list to optionally convert layout on the fly later + self.conv2d_weights.append(self._index) + return tensor + + def add_int32(self, name: str, value: int): + print("*", name, "=", value) + super().add_int32(name, value) + + def set_tensor_layout(self, layout: TensorLayout): + print("*", f"{self.arch}.tensor_data_layout", "=", layout.value) + self.tensor_layout = layout + self.add_tensor_data_layout(layout.value) + + def set_tensor_layout_default(self, layout: TensorLayout): + if self.tensor_layout is TensorLayout.unknown: + self.set_tensor_layout(layout) + + def add_conv2d_weight_indices(self): + if self.conv2d_weights: + self.add_array(f"{self.arch}.conv2d_weights", self.conv2d_weights) batch_norm_eps = 1e-5 @@ -124,7 +170,8 @@ def fuse_conv_2d_batch_norm( fused_weight = conv_weight * bn_weight[:, None, None, None] fused_bias = (conv_bias - bn_mean) * bn_weight + bn_bias - writer.add_tensor(name, conv_2d_to_nhwc(fused_weight)) + fused_weight = writer.convert_tensor_2d(fused_weight) + writer.add_tensor(name, fused_weight) writer.add_tensor(name.replace("weight", "bias"), fused_bias) return True @@ -135,7 +182,7 @@ def fuse_conv_2d_batch_norm( elif suffix_norm in key: return True # batch norm was fused above - return False # no match + return False # tensor is not 
part of conv2d+batch-norm # @@ -144,6 +191,7 @@ def fuse_conv_2d_batch_norm( def convert_sam(input_filepath: Path, writer: Writer): writer.add_license("apache-2.0") + writer.set_tensor_layout_default(TensorLayout.nchw) model: dict[str, Tensor] = torch.load(input_filepath, map_location="cpu", weights_only=True) @@ -161,12 +209,19 @@ def convert_sam(input_filepath: Path, writer: Writer): name = name + "_indexed" tensor = tensor[:, attention_bias_idxs] + if "local_conv" in key: # always convert to nhwc + original_tensor_layout = writer.tensor_layout + writer.tensor_layout = TensorLayout.nhwc + fuse_conv_2d_batch_norm(model, key, name, "", "c", "bn", writer) + writer.tensor_layout = original_tensor_layout + continue + if fuse_conv_2d_batch_norm(model, key, name, "", "c", "bn", writer): continue if name.endswith("neck.0.weight") or name.endswith("neck.2.weight"): assert tensor.shape[2] == tensor.shape[3] and tensor.shape[2] <= 3 - tensor = conv_2d_to_nhwc(tensor) + tensor = writer.convert_tensor_2d(tensor) # Precompute dense positional embeddings from random matrix stored in the model if name == "prompt_encoder.pe_layer.positional_encoding_gaussian_matrix": @@ -221,10 +276,29 @@ def build_dense_positional_embeddings( def convert_birefnet(input_filepath: Path, writer: Writer): writer.add_license("mit") + writer.set_tensor_layout_default(TensorLayout.nchw) weights = safetensors.safe_open(input_filepath, "pt") model: dict[str, Tensor] = {k: weights.get_tensor(k) for k in weights.keys()} + x = model["bb.layers.0.blocks.0.attn.proj.bias"] + if x.shape[0] == 96: + writer.add_string("swin.config", "tiny") + writer.add_int32("swin.embed_dim", 96) + elif x.shape[0] == 192: + writer.add_string("swin.config", "large") + writer.add_int32("swin.embed_dim", 192) + else: + raise ValueError(f"Unsupported Swin Transformer embed dim: {x.shape[0]}") + + image_size = 1024 + if "HR" in input_filepath.name or "2K" in input_filepath.name: + image_size = 2048 # actually 2K should rather be 2560x1440 + elif "dynamic" in input_filepath.name: + image_size = -1 + writer.add_int32("birefnet.image_size", image_size) + writer.add_int32("birefnet.image_multiple", 128) + for key, tensor in model.items(): # Shorten some names to fit into 64 chars name = key @@ -259,7 +333,10 @@ def convert_birefnet(input_filepath: Path, writer: Writer): continue # batch norm was fused if is_conv_2d(name, tensor): - tensor = conv_2d_to_nhwc(tensor) + if "patch_embed" in name: # part of SWIN, always store as NHWC + tensor = conv_2d_to_nhwc(tensor) + else: # store rest in requested tensor layout + tensor = writer.convert_tensor_2d(tensor) writer.add_tensor(name, tensor) @@ -270,12 +347,18 @@ def convert_birefnet(input_filepath: Path, writer: Writer): def convert_migan(input_filepath: Path, writer: Writer): writer.add_license("mit") + writer.set_tensor_layout_default(TensorLayout.nchw) model: dict[str, Tensor] = torch.load(input_filepath, weights_only=True) + if "encoder.b512.fromrgb.weight" in model: + writer.add_int32("migan.image_size", 512) + elif "encoder.b256.fromrgb.weight" in model: + writer.add_int32("migan.image_size", 256) + for name, tensor in model.items(): if is_conv_2d(name, tensor): - tensor = conv_2d_to_nhwc(tensor) + tensor = writer.convert_tensor_2d(tensor) writer.add_tensor(name, tensor) @@ -296,10 +379,17 @@ def convert_esrgan(input_filepath: Path, writer: Writer): if getattr(model.model, "plus", False): raise ValueError("RealESRGAN+ (plus) models are not supported yet.") + writer.set_tensor_layout_default(TensorLayout.nchw) + 
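+    # Metadata read back by esrgan_detect_params() on the C++ side; tags such as "23nb" or "64nf" (assumed format) encode the block and filter counts.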
writer.add_int32("esrgan.scale", model.scale) + for tag in model.tags: + if tag.endswith("nb"): + writer.add_int32("esrgan.block_count", int(tag[:-2])) + if tag.endswith("nf"): + writer.add_int32("esrgan.filter_count", int(tag[:-2])) + for name, tensor in model.model.state_dict().items(): if is_conv_2d(name, tensor): - tensor = conv_2d_to_nhwc(tensor) - + tensor = writer.convert_tensor_2d(tensor) writer.add_tensor(name, tensor) @@ -319,10 +409,11 @@ def convert_esrgan(input_filepath: Path, writer: Writer): if __name__ == "__main__": # fmt: off parser = argparse.ArgumentParser(description="Convert model weights (.pt/.pth/.safetensors) to GGUF format.") - parser.add_argument("arch", choices=["sam", "birefnet", "migan", "esrgan"], help="Model architecture") + parser.add_argument("arch", choices=list(arch_names.keys()), help="Model architecture") parser.add_argument("input", type=str, help="Path to the input model file") parser.add_argument("--output", "-o", type=str, default="models", help="Path to the output directory or file") parser.add_argument("--quantize", "-q", choices=["f16"], default=None, help="Convert float weights to the specified data type") + parser.add_argument("--layout", "-l", choices=["whcn", "cwhn"], default=None, help="Tensor data layout for 2D operations like convolution") parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose output") parser.add_argument("--model-name", type=str, default=None, help="Name of the model for metadata") parser.add_argument("--metadata", type=Path, help="Specify the path for an authorship metadata override file") @@ -332,8 +423,9 @@ def convert_esrgan(input_filepath: Path, writer: Writer): input_path = Path(args.input) output_path = Path(args.output) quant_suffix = f"-{args.quantize.upper()}" if args.quantize else "" + layout_suffix = f"-{args.layout.upper()}" if args.layout else "" if output_path.is_dir() or output_path.suffix != ".gguf": - output_path = output_path / f"{input_path.stem}{quant_suffix}.gguf" + output_path = output_path / f"{input_path.stem}{quant_suffix}{layout_suffix}.gguf" print(f"Converting {args.arch}") print("* input: ", input_path) @@ -348,6 +440,9 @@ def convert_esrgan(input_filepath: Path, writer: Writer): ) metadata = Metadata.load(args.metadata, input_path.with_suffix(""), args.model_name) + if args.layout is not None: + writer.set_tensor_layout(TensorLayout.parse(args.layout)) + match args.arch: case "sam": convert_sam(input_path, writer) @@ -362,8 +457,8 @@ def convert_esrgan(input_filepath: Path, writer: Writer): metadata.set_gguf_meta_model(writer) writer.add_quantization_version(GGML_QUANT_VERSION) - writer.add_tensor_data_layout("cwhn") writer.add_file_type(file_types[args.quantize]) + writer.add_conv2d_weight_indices() writer.write_header_to_file() writer.write_kv_data_to_file() writer.write_tensors_to_file(progress=True) diff --git a/src/cli/CMakeLists.txt b/src/cli/CMakeLists.txt index 53fc9a0..aaf21d2 100644 --- a/src/cli/CMakeLists.txt +++ b/src/cli/CMakeLists.txt @@ -1,6 +1,7 @@ add_executable(vision-cli) target_sources(vision-cli PRIVATE cli.cpp) target_include_directories(vision-cli PRIVATE ..) 
-target_compile_definitions(vision-cli PRIVATE ${VISP_ASSERT} ${VISP_FMT_DEFS}) -target_compile_options(vision-cli PRIVATE ${VISP_WARNINGS}) +target_compile_definitions(vision-cli PRIVATE ${VISP_ASSERT} ${VISP_DEFINITIONS}) +target_compile_options(vision-cli PRIVATE ${VISP_WARNINGS} ${VISP_COMP_OPTIONS}) +target_link_options(vision-cli PRIVATE ${VISP_LINK_OPTIONS}) target_link_libraries(vision-cli PRIVATE visioncpp ${VISP_FMT_LINK}) \ No newline at end of file diff --git a/src/cli/cli.cpp b/src/cli/cli.cpp index ba53776..25f2ee9 100644 --- a/src/cli/cli.cpp +++ b/src/cli/cli.cpp @@ -1,6 +1,6 @@ -#include "util/math.hpp" -#include "util/string.hpp" -#include "visp/vision.hpp" +#include "util/math.h" +#include "util/string.h" +#include "visp/vision.h" #include #include @@ -32,7 +32,7 @@ struct cli_args { }; void print_usage() { -char const* const usage = R"( + char const* const usage = R"( Usage: vision-cli [options] Commands: @@ -181,6 +181,7 @@ int main(int argc, char** argv) { case cli_command::birefnet: run_birefnet(args); break; case cli_command::migan: run_migan(args); break; case cli_command::esrgan: run_esrgan(args); break; + case cli_command::none: break; } } catch (std::exception const& e) { @@ -231,22 +232,38 @@ backend_device backend_init(cli_args const& args) { return b; } -model_weights load_model_weights( - cli_args const& args, backend_device const& b, char const* default_model, int n_tensors = 0) { +char const* to_string(tensor_data_layout l) { + switch (l) { + case tensor_data_layout::cwhn: return "cwhn"; + case tensor_data_layout::whcn: return "whcn"; + default: return "unknown"; + } +} + +std::tuple<model_file, model_weights> load_model_weights( + cli_args const& args, + backend_device const& dev, + char const* default_model, + int n_tensors = 0, + tensor_data_layout preferred_layout = tensor_data_layout::unknown) { timer t; char const* model_path = args.model ? args.model : default_model; printf("Loading model weights from '%s'...
", model_path); - model_load_params load_params = { - .float_type = b.preferred_float_type(), - .n_extra_tensors = n_tensors, - }; - model_weights weights = model_load(model_path, b, load_params); + model_file file = model_load(model_path); + model_weights weights = model_init(file.n_tensors() + n_tensors); + if (preferred_layout == tensor_data_layout::unknown) { + preferred_layout = file.tensor_layout(); + } + model_transfer(file, weights, dev, dev.preferred_float_type(), preferred_layout); printf("done (%s)\n", t.elapsed_str()); printf("- float type: %s\n", ggml_type_name(weights.float_type())); - return weights; + if (preferred_layout != tensor_data_layout::unknown) { + printf("- tensor layout: %s\n", to_string(preferred_layout)); + } + return {std::move(file), std::move(weights)}; } void compute_timed(compute_graph const& g, backend_device const& b) { @@ -323,7 +340,8 @@ sam_prompt sam_parse_prompt(std::span args, i32x2 extent) { void run_sam(cli_args const& args) { backend_device backend = backend_init(args); - model_weights weights = load_model_weights(args, backend, "models/MobileSAM-F16.gguf"); + auto [file, weights] = load_model_weights( + args, backend, "models/MobileSAM-F16.gguf", 0, backend.preferred_layout()); sam_params params{}; require_inputs(args.inputs, 1, ""); @@ -376,33 +394,36 @@ void run_sam(cli_args const& args) { void run_birefnet(cli_args const& args) { backend_device backend = backend_init(args); - model_weights weights = load_model_weights(args, backend, "models/BiRefNet-F16.gguf", 6); - birefnet_params params = birefnet_detect_params(weights); - int img_size = params.image_size; + auto [file, weights] = load_model_weights( + args, backend, "models/BiRefNet-F16.gguf", 0, backend.preferred_layout()); require_inputs(args.inputs, 1, ""); image_data image = image_load(args.inputs[0]); + birefnet_params params = birefnet_detect_params(file, image.extent); image_data input_data = birefnet_process_input(image, params); - birefnet_buffers buffers = birefnet_precompute(model_ref(weights), params); - model_allocate(weights, backend); - for (tensor_data const& buf : buffers) { - transfer_to_backend(buf); - } + i32x2 extent = params.image_extent; + char const* image_size_str = params.image_size < 0 ? 
" (dynamic)" : ""; + printf("- model image size: %d%s\n", params.image_size, image_size_str); + printf("- inference image size: %dx%d\n", extent[0], extent[1]); compute_graph graph = compute_graph_init(6 * 1024); model_ref m(weights, graph); - tensor input = compute_graph_input(m, GGML_TYPE_F32, {3, img_size, img_size, 1}); + birefnet_buffers buffers = birefnet_precompute(m, params); + tensor input = compute_graph_input(m, GGML_TYPE_F32, {3, extent[0], extent[1], 1}); tensor output = birefnet_predict(m, input, params); compute_graph_allocate(graph, backend); transfer_to_backend(input, input_data); + for (tensor_data const& buf : buffers) { + transfer_to_backend(buf); + } compute_timed(graph, backend); tensor_data mask_data = transfer_from_backend(output); - image_view mask_output({img_size, img_size}, mask_data.as_f32()); + image_view mask_output(extent, mask_data.as_f32()); image_data mask_resized = image_scale(mask_output, image.extent); image_data mask = image_f32_to_u8(mask_resized, image_format::alpha_u8); image_save(mask, args.output); @@ -416,8 +437,9 @@ void run_migan(cli_args const& args) { backend_device backend = backend_init(args); - model_weights weights = load_model_weights(args, backend, "models/MIGAN-512-places2-F16.gguf"); - migan_params params = migan_detect_params(weights); + auto [file, weights] = load_model_weights( + args, backend, "models/MIGAN-512-places2-F16.gguf", 0, backend.preferred_layout()); + migan_params params = migan_detect_params(file); params.invert_mask = true; // -> inpaint opaque areas require_inputs(args.inputs, 2, " "); @@ -453,8 +475,11 @@ void run_esrgan(cli_args const& args) { backend_device backend = backend_init(args); - model_weights weights = load_model_weights(args, backend, "models/RealESRGAN-x4.gguf"); - esrgan_params params = esrgan_detect_params(weights); + auto [file, weights] = load_model_weights( + args, backend, "models/RealESRGAN-x4.gguf", 0, backend.preferred_layout()); + esrgan_params params = esrgan_detect_params(file); + printf("- scale: %dx\n", params.scale); + printf("- block count: %d\n", params.n_blocks); require_inputs(args.inputs, 1, ""); image_data image = image_load(args.inputs[0]); @@ -465,6 +490,7 @@ image_data input_tile = image_alloc(tiles.tile_size, image_format::rgb_f32); image_data output_tile = image_alloc(tiles_out.tile_size, image_format::rgb_f32); image_data output_image = image_alloc(image.extent * params.scale, image_format::rgb_f32); + image_clear(output_image); compute_graph graph = compute_graph_init(esrgan_estimate_graph_size(params)); model_ref m(weights, graph); diff --git a/src/util/math.hpp b/src/util/math.h similarity index 95% rename from src/util/math.hpp rename to src/util/math.h index 02e290d..835229d 100644 --- a/src/util/math.hpp +++ b/src/util/math.h @@ -1,9 +1,9 @@ #pragma once -#include "visp/util.hpp" +#include "visp/util.h" #include -#include +#include namespace visp { using std::clamp; @@ -12,6 +12,8 @@ using std::clamp; constexpr int32_t div_ceil(int32_t a, int32_t b) { return (a + b - 1) / b; } constexpr int64_t div_ceil(int64_t a, int64_t b) { return (a + b - 1) / b; } +constexpr int32_t next_multiple(int32_t x, int32_t mult) { return div_ceil(x, mult) * mult; } + constexpr float sqr(float x) { return x * x; } constexpr int sqr(int x) { return x * x; } diff --git a/src/util/string.hpp b/src/util/string.h similarity index 99% rename from src/util/string.hpp rename to
src/util/string.h index a0b61a9..220b751 100644 --- a/src/util/string.hpp +++ b/src/util/string.h @@ -1,6 +1,6 @@ #pragma once -#include "visp/util.hpp" +#include "visp/util.h" #include #include diff --git a/src/visp/CMakeLists.txt b/src/visp/CMakeLists.txt index d9ecb4e..5cdbd54 100644 --- a/src/visp/CMakeLists.txt +++ b/src/visp/CMakeLists.txt @@ -12,21 +12,22 @@ target_sources(visioncpp PRIVATE vision.cpp ) target_compile_features(visioncpp PUBLIC cxx_std_20) -target_compile_definitions(visioncpp PRIVATE VISP_API_EXPORT ${VISP_ASSERT} ${VISP_FMT_DEFS}) -target_compile_options(visioncpp PRIVATE ${VISP_WARNINGS}) +target_compile_definitions(visioncpp PRIVATE VISP_API_EXPORT ${VISP_ASSERT} ${VISP_DEFINITIONS}) +target_compile_options(visioncpp PRIVATE ${VISP_WARNINGS} ${VISP_COMP_OPTIONS}) target_include_directories(visioncpp PUBLIC $ $ PRIVATE .. ) +target_link_options(visioncpp PRIVATE ${VISP_LINK_OPTIONS}) target_link_libraries(visioncpp PUBLIC ggml PRIVATE stb ${VISP_FMT_LINK} ) -set_target_properties(visioncpp PROPERTIES - VERSION ${PROJECT_VERSION} - SOVERSION ${PROJECT_VERSION_MAJOR} -) +# set_target_properties(visioncpp PROPERTIES +# VERSION ${PROJECT_VERSION} +# SOVERSION ${PROJECT_VERSION_MAJOR} +# ) if (MSVC AND VISP_TESTS) set_target_properties(visioncpp PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) diff --git a/src/visp/arch/birefnet.cpp b/src/visp/arch/birefnet.cpp index ec2d780..9c9ad37 100644 --- a/src/visp/arch/birefnet.cpp +++ b/src/visp/arch/birefnet.cpp @@ -1,13 +1,11 @@ -#include "visp/arch/birefnet.hpp" -#include "visp/nn.hpp" -#include "visp/vision.hpp" -#include "util/math.hpp" -#include "util/string.hpp" +#include "visp/arch/birefnet.h" +#include "util/math.h" +#include "util/string.h" +#include "visp/nn.h" +#include "visp/vision.h" #include -#include - namespace visp { namespace birefnet { @@ -86,8 +84,8 @@ tensor window_attention(model_ref m, tensor x, tensor mask, int num_heads, int w tensor attn = ggml_mul_mat(m, k, q); - tensor rel_pos_index = - m.with_prefix(format("window_attention_{}", window)).weights("rel_pos_index"); + tensor_name rel_pos_name = format("window_attention_{}.rel_pos_index", window); + tensor rel_pos_index = ggml_get_tensor(m, rel_pos_name.c_str()); tensor rel_pos_table = m.weights("relative_position_bias_table"); tensor rel_pos_bias = ggml_get_rows(m, rel_pos_table, rel_pos_index); rel_pos_bias = ggml_reshape_4d(m, rel_pos_bias, num_heads, window * window, window * window, 1); @@ -235,18 +233,18 @@ tensor_data create_attention_mask(ggml_context* ctx, int64_t w, int64_t h, int w swin_layer_result swin_layer( model_ref m, tensor x, int64_t w, int64_t h, swin_layer_t const& p, int window_size) { // Attention masks need to be precomputed - tensor attn_mask = - m.with_prefix(format("swin_layer_{}x{}", w, h)).find("attn_mask"); + tensor_name attn_mask_name = format("swin_layer_{}x{}.attn_mask", w, h); + tensor attn_mask = ggml_get_tensor(m, attn_mask_name.c_str()); model_ref blocks = m["blocks"]; for (int i = 0; i < p.depth; ++i) { - swin_block_params block_params = { - .n_heads = p.n_heads, - .window_size = window_size, - .w = w, - .h = h, - .shift = i % 2 == 0 ? 0 : window_size / 2}; - x = swin_block(blocks[i], x, attn_mask, block_params); + x = swin_block( + blocks[i], x, attn_mask, + {.n_heads = p.n_heads, + .window_size = window_size, + .w = w, + .h = h, + .shift = i % 2 == 0 ? 
0 : window_size / 2}); } if (p.downsample) { tensor x_down = patch_merging(m["downsample"], x, w, h); @@ -258,6 +256,7 @@ swin_layer_result swin_layer( tensor patch_embed(model_ref m, tensor x, int patch_size) { ASSERT(x->ne[1] % patch_size == 0 && x->ne[2] % patch_size == 0); + m.flags |= model_build_flag::cwhn; x = conv_2d(m["proj"], x, patch_size); auto [c, ww, wh, b] = nelements(x); x = ggml_reshape_3d(m, x, c, ww * wh, b); @@ -287,17 +286,19 @@ swin_result swin_transformer(model_ref m, tensor x, swin_params const& p) { return outs; } -constexpr int32_t bilinear_align_corners = GGML_SCALE_MODE_BILINEAR | (int)GGML_SCALE_FLAG_ALIGN_CORNERS; +constexpr int32_t bilinear_align_corners = GGML_SCALE_MODE_BILINEAR | + (int)GGML_SCALE_FLAG_ALIGN_CORNERS; tensor upscale_to_whcn(model_ref m, tensor x, tensor target) { return interpolate(m, x, {target->ne[0], target->ne[1]}, bilinear_align_corners); } tensor upscale_to(model_ref m, tensor x, tensor target) { - x = permute_cwhn_to_whcn(m, x); - x = interpolate(m, x, {target->ne[1], target->ne[2]}, bilinear_align_corners); - x = permute_whcn_to_cwhn(m, x); - return ggml_cont(m, x); + auto [target_width, target_height, c, n] = nelements_whcn(m, target); + x = contiguous_2d_to_whcn(m, x); + x = interpolate(m, x, {target_width, target_height}, bilinear_align_corners); + x = whcn_to_contiguous_2d(m, x); + return x; } tensor downscale_by_whcn(model_ref m, tensor x, int f) { @@ -305,34 +306,32 @@ tensor downscale_by_whcn(model_ref m, tensor x, int f) { } tensor downscale_by(model_ref m, tensor x, int f) { - x = permute_cwhn_to_whcn(m, x); + x = ggml_cont(m, permute_cwhn_to_whcn(m, x)); x = downscale_by_whcn(m, x, f); - x = permute_whcn_to_cwhn(m, x); - return ggml_cont(m, x); + x = ggml_cont(m, permute_whcn_to_cwhn(m, x)); + return x; } swin_result encode_concat(model_ref m, swin_result& xs, swin_result& xs_low) { // TODO: implement cwhn upscale/interpolate which allows downscale & align_corners=True - // cwhn -> whcn for (int i = 0; i < 4; ++i) { - xs[i] = ggml_cont(m, ggml_permute(m, xs[i], 2, 0, 1, 3)); - xs_low[i] = ggml_permute(m, xs_low[i], 2, 0, 1, 3); + xs[i] = ggml_cont(m, permute_cwhn_to_whcn(m, xs[i])); + xs_low[i] = permute_cwhn_to_whcn(m, xs_low[i]); } - + // clang-format off xs[0] = concat(m, {xs[0], upscale_to_whcn(m, xs_low[0], xs[0])}, 2); xs[1] = concat(m, {xs[1], upscale_to_whcn(m, xs_low[1], xs[1])}, 2); xs[2] = concat(m, {xs[2], upscale_to_whcn(m, xs_low[2], xs[2])}, 2); xs[3] = concat(m, {xs[3], upscale_to_whcn(m, xs_low[3], xs[3])}, 2); + xs[3] = concat(m, {downscale_by_whcn(m, xs[0], 8), + downscale_by_whcn(m, xs[1], 4), + downscale_by_whcn(m, xs[2], 2), + xs[3]}, /*dim = */ 2); + // clang-format on - xs[3] = concat( - m, - {downscale_by_whcn(m, xs[0], 8), downscale_by_whcn(m, xs[1], 4), - downscale_by_whcn(m, xs[2], 2), xs[3]}, - /*dim = */ 2); - - // whcn -> cwhn + // whcn -> native for (int i = 0; i < 4; ++i) { - xs[i] = ggml_cont(m, ggml_permute(m, xs[i], 1, 2, 0, 3)); + xs[i] = whcn_to_contiguous_2d(m, xs[i]); } return xs; } @@ -364,12 +363,11 @@ tensor deformable_conv_2d(model_ref m, tensor x, int stride, int pad) { } tensor mean_2d(model_ref m, tensor x) { - auto [c, w, h, n] = nelements(x); - x = ggml_cont(m, ggml_permute(m, x, 2, 0, 1, 3)); // cwhn -> whcn - x = ggml_mean(m, x); - x = ggml_reshape_3d(m, x, h, c, n); + auto [w, h, c, n] = nelements_whcn(m, x); + x = contiguous_2d_to_whcn(m, x); + x = ggml_reshape_3d(m, x, w * h, c, n); x = ggml_mean(m, x); - x = ggml_reshape_4d(m, x, c, 1, 1, n); + x = is_cwhn(m) ? 
ggml_reshape_4d(m, x, c, 1, 1, n) : ggml_reshape_4d(m, x, 1, 1, c, n); return x; } @@ -389,6 +387,7 @@ tensor aspp_module_deformable(model_ref m, tensor x, int padding) { tensor aspp_deformable(model_ref m, tensor x) { const int kernel_sizes[] = {1, 3, 7}; + const int channel_dim = is_cwhn(m) ? 0 : 2; tensor x1 = aspp_module_deformable(m["aspp1"], x); model_ref aspp_deforms = m["aspp_deforms"]; @@ -398,10 +397,11 @@ tensor aspp_deformable(model_ref m, tensor x) { x_deforms[i] = aspp_module_deformable(aspp_deforms[i], x, padding); } tensor x5 = global_avg_pool(m["global_avg_pool"], x); - x5 = permute_cwhn_to_whcn(m, x5); - x5 = interpolate(m, x5, {x1->ne[1], x1->ne[2]}, bilinear_align_corners); - x5 = ggml_cont(m, permute_whcn_to_cwhn(m, x5)); - x = concat(m, {x1, x_deforms[0], x_deforms[1], x_deforms[2], x5}, 0); + auto [w1, h1, c, n] = nelements_whcn(m, x1); + x5 = contiguous_2d_to_whcn(m, x5); + x5 = interpolate(m, x5, {w1, h1}, bilinear_align_corners); + x5 = whcn_to_contiguous_2d(m, x5); + x = concat(m, {x1, x_deforms[0], x_deforms[1], x_deforms[2], x5}, channel_dim); x = conv_2d_batch_norm(m["conv1"], x); x = ggml_relu_inplace(m, x); @@ -440,17 +440,22 @@ tensor gdt_conv(model_ref m, tensor x) { } tensor decode(model_ref m, tensor x, swin_result const& features) { + const int channel_dim = is_cwhn(m) ? 0 : 2; + tensor x1 = features[0]; tensor x2 = features[1]; tensor x3 = features[2]; tensor x4 = features[3]; - tensor x_whcn = ggml_cont(m, ggml_permute(m, x, 2, 0, 1, 3)); // cwhn -> whcn - + tensor x_whcn = ggml_cont(m, permute_cwhn_to_whcn(m, x)); + if (is_whcn(m)) { + x = x_whcn; + } { - tensor patches = image_to_patches(m, x_whcn, x4->ne[1], x4->ne[2]); - patches = ggml_cont(m, ggml_permute(m, patches, 1, 2, 0, 3)); // whcn -> cwhn + auto [w, h, c, n] = nelements_whcn(m, x4); + tensor patches = image_to_patches(m, x_whcn, w, h); + patches = whcn_to_contiguous_2d(m, patches); patches = simple_conv(m["ipt_blk5"], patches); - x4 = ggml_concat(m, x4, patches, 0); + x4 = ggml_concat(m, x4, patches, channel_dim); } tensor p4 = basic_decoder_block(m["block4"], x4); tensor p4_gdt = gdt_conv(m["gdt_convs_4"], p4); @@ -463,10 +468,11 @@ tensor decode(model_ref m, tensor x, swin_result const& features) { tensor _p3 = ggml_add_inplace(m, _p4, x3); { - tensor patches = image_to_patches(m, x_whcn, _p3->ne[1], _p3->ne[2]); - patches = ggml_cont(m, ggml_permute(m, patches, 1, 2, 0, 3)); // whcn -> cwhn + auto [w, h, c, n] = nelements_whcn(m, _p3); + tensor patches = image_to_patches(m, x_whcn, w, h); + patches = whcn_to_contiguous_2d(m, patches); patches = simple_conv(m["ipt_blk4"], patches); - _p3 = ggml_concat(m, _p3, patches, 0); + _p3 = ggml_concat(m, _p3, patches, channel_dim); } tensor p3 = basic_decoder_block(m["block3"], _p3); tensor p3_gdt = gdt_conv(m["gdt_convs_3"], p3); @@ -479,10 +485,11 @@ tensor decode(model_ref m, tensor x, swin_result const& features) { tensor _p2 = ggml_add_inplace(m, _p3, x2); { - tensor patches = image_to_patches(m, x_whcn, _p2->ne[1], _p2->ne[2]); - patches = ggml_cont(m, ggml_permute(m, patches, 1, 2, 0, 3)); // whcn -> cwhn + auto [w, h, c, n] = nelements_whcn(m, _p2); + tensor patches = image_to_patches(m, x_whcn, w, h); + patches = whcn_to_contiguous_2d(m, patches); patches = simple_conv(m["ipt_blk3"], patches); - _p2 = ggml_concat(m, _p2, patches, 0); + _p2 = ggml_concat(m, _p2, patches, channel_dim); } tensor p2 = basic_decoder_block(m["block2"], _p2); tensor p2_gdt = gdt_conv(m["gdt_convs_2"], p2); @@ -495,15 +502,16 @@ tensor decode(model_ref m, tensor 
x, swin_result const& features) { tensor _p1 = ggml_add_inplace(m, _p2, x1); { - tensor patches = image_to_patches(m, x_whcn, _p1->ne[1], _p1->ne[2]); - patches = ggml_cont(m, ggml_permute(m, patches, 1, 2, 0, 3)); // whcn -> cwhn + auto [w, h, c, n] = nelements_whcn(m, _p1); + tensor patches = image_to_patches(m, x_whcn, w, h); + patches = whcn_to_contiguous_2d(m, patches); patches = simple_conv(m["ipt_blk2"], patches); - _p1 = ggml_concat(m, _p1, patches, 0); + _p1 = ggml_concat(m, _p1, patches, channel_dim); } _p1 = basic_decoder_block(m["block1"], _p1); _p1 = upscale_to(m, _p1, x); tensor p1_ipt = simple_conv(m["ipt_blk1"], x); - _p1 = ggml_concat(m, _p1, p1_ipt, 0); + _p1 = ggml_concat(m, _p1, p1_ipt, channel_dim); tensor p1_out = conv_2d(m["conv_out1.0"], _p1); p1_out = ggml_sigmoid_inplace(m, p1_out); @@ -528,10 +536,10 @@ image_data birefnet_process_input(image_view image, birefnet_params const& p) { constexpr f32x4 mean = f32x4{0.485f, 0.456f, 0.406f, 0.f}; constexpr f32x4 std = f32x4{0.229f, 0.224f, 0.225f, 1.f}; - std::optional resized; - if (image.extent[0] != p.image_size || image.extent[1] != p.image_size) { - resized = image_scale(image, i32x2{p.image_size, p.image_size}); - image = image_view(*resized); + image_data resized; + if (image.extent != p.image_extent) { + resized = image_scale(image, p.image_extent); + image = image_view(resized); } return image_u8_to_f32(image, image_format::rgb_f32, -mean, 1.f / std); @@ -540,10 +548,9 @@ image_data birefnet_process_input(image_view image, birefnet_params const& p) { image_data birefnet_process_output( span mask_data, i32x2 target_extent, birefnet_params const& p) { - i32x2 model_extent = {p.image_size, p.image_size}; - image_view mask_output(model_extent, mask_data); + image_view mask_output(p.image_extent, mask_data); image_data mask_resized; - if (model_extent != target_extent) { + if (p.image_extent != target_extent) { mask_resized = image_scale(mask_output, target_extent); mask_output = mask_resized; } @@ -552,12 +559,13 @@ image_data birefnet_process_output( birefnet_buffers birefnet_precompute(model_ref m, birefnet_params const& params) { int w = params.encoder.window_size; - int res = params.image_size / 4; + int width = params.image_extent[0] / 4; + int height = params.image_extent[1] / 4; birefnet_buffers b; b[0] = birefnet::create_relative_position_index(m, w); for (int i = 0; i < swin_params::n_layers + 1; ++i) { - b[i + 1] = birefnet::create_attention_mask(m, res >> i, res >> i, w); + b[i + 1] = birefnet::create_attention_mask(m, width >> i, height >> i, w); } return b; } @@ -584,24 +592,37 @@ const swin_params swin_l_params = { swin_layer_t{2, 48, 192 * 8, false}}}; // clang-format on -swin_params swin_detect_params(model_ref m) { - tensor t = m.find("bb.layers.0.blocks.0.attn.proj.bias"); - if (t == nullptr) { - throw except("Failed to detect model parameters"); - } - if (t->ne[0] == 96) { +swin_params swin_detect_params(model_file const& f) { + int embed_dim = f.get_int("swin.embed_dim"); + if (embed_dim == 96) { return swin_t_params; - } else if (t->ne[0] == 192) { + } else if (embed_dim == 192) { return swin_l_params; } else { - throw except("Unsupported Swin Transformer embed dim: {}", t->ne[0]); + throw except("Unsupported Swin Transformer embed dim: {}", embed_dim); } } -birefnet_params birefnet_detect_params(model_ref m) { +i32x2 birefnet_image_extent(i32x2 input_extent, birefnet_params const& p) { + i32x2 extent{p.image_size, p.image_size}; + if (p.image_size == -1) { + ASSERT(input_extent[0] > 0 && 
input_extent[1] > 0); + extent = { + next_multiple(input_extent[0], p.image_multiple), + next_multiple(input_extent[1], p.image_multiple)}; + } + return extent; +} + +birefnet_params birefnet_detect_params(model_file const& f, i32x2 dynamic_extent) { + if (std::string_view arch = f.arch(); arch != "birefnet") { + throw except("Architecture expected to be 'birefnet', but was '{}' ({})", arch, f.path); + } birefnet_params p; - p.image_size = 1024; // TODO: support 2K models - p.encoder = swin_detect_params(m); + p.image_size = f.get_int("birefnet.image_size"); + p.image_multiple = f.get_int("birefnet.image_multiple"); + p.image_extent = birefnet_image_extent(dynamic_extent, p); + p.encoder = swin_detect_params(f); return p; } diff --git a/src/visp/arch/birefnet.hpp b/src/visp/arch/birefnet.h similarity index 97% rename from src/visp/arch/birefnet.hpp rename to src/visp/arch/birefnet.h index 9cbbda2..7f109ad 100644 --- a/src/visp/arch/birefnet.hpp +++ b/src/visp/arch/birefnet.h @@ -1,7 +1,7 @@ #pragma once -#include "visp/ml.hpp" -#include "visp/image.hpp" +#include "visp/ml.h" +#include "visp/image.h" #include diff --git a/src/visp/arch/esrgan.cpp b/src/visp/arch/esrgan.cpp index fdf48e0..a10deb7 100644 --- a/src/visp/arch/esrgan.cpp +++ b/src/visp/arch/esrgan.cpp @@ -1,7 +1,7 @@ -#include "visp/arch/esrgan.hpp" -#include "visp/nn.hpp" -#include "visp/vision.hpp" -#include "util/string.hpp" +#include "visp/arch/esrgan.h" +#include "util/string.h" +#include "visp/nn.h" +#include "visp/vision.h" #include #include @@ -10,8 +10,8 @@ namespace visp { namespace esrgan { tensor upsample(model_ref m, tensor x) { - auto [c, w, h, n] = nelements(x); - x = ggml_interpolate(m, x, int(c), int(w * 2), int(h * 2), int(n), GGML_SCALE_MODE_NEAREST); + auto [w, h, c, n] = nelements_whcn(m, x); + x = interpolate(m, x, {w * 2, h * 2}, GGML_SCALE_MODE_NEAREST); x = conv_2d(m, x, 1, 1); x = ggml_leaky_relu(m, x, 0.2f, true); return named(m, x); @@ -24,14 +24,15 @@ tensor conv_block(model_ref m, tensor x) { } tensor risidual_dense_block(model_ref m, tensor x) { + int dim = (m.flags & model_build_flag::cwhn) ? 
0 : 2; tensor x1 = conv_block(m["conv1"], x); - tensor c1 = concat(m, {x, x1}, 0); + tensor c1 = concat(m, {x, x1}, dim); tensor x2 = conv_block(m["conv2"], c1); - tensor c2 = concat(m, {c1, x2}, 0); + tensor c2 = concat(m, {c1, x2}, dim); tensor x3 = conv_block(m["conv3"], c2); - tensor c3 = concat(m, {c2, x3}, 0); + tensor c3 = concat(m, {c2, x3}, dim); tensor x4 = conv_block(m["conv4"], c3); - tensor c4 = concat(m, {c3, x4}, 0); + tensor c4 = concat(m, {c3, x4}, dim); tensor x5 = conv_2d(m["conv5.0"], c4, 1, 1); x5 = ggml_scale_inplace(m, x5, 0.2f); x = ggml_add(m, x, x5); @@ -52,6 +53,7 @@ tensor rrdb(model_ref m, tensor x) { tensor esrgan_generate(model_ref m, tensor x, esrgan_params const& p) { m = m["model"]; + x = cwhn_to_contiguous_2d(m, x); x = conv_2d(m[0], x, 1, 1); tensor sub = x; @@ -71,41 +73,23 @@ tensor esrgan_generate(model_ref m, tensor x, esrgan_params const& p) { x = ggml_leaky_relu(m, x, 0.2f, true); x = conv_2d(m[seq + 2], x, 1, 1); + x = contiguous_2d_to_cwhn(m, x); return compute_graph_output(m, x, "result"); } -esrgan_params esrgan_detect_params(model_ref m) { - esrgan_params p; - p.n_blocks = 0; - int model_len = 0; - - ggml_context* ctx = m.weights_context; - for (tensor t = ggml_get_first_tensor(ctx); t; t = ggml_get_next_tensor(ctx, t)) { - auto name = std::string_view(ggml_get_name(t)); - if (name.starts_with("model.")) { - name = name.substr(6); - int x = 0; - auto r = std::from_chars(name.data(), name.data() + 2, x); - model_len = std::max(model_len, x + 1); - - size_t i_dot = name.find('.'); - if (i_dot == std::string_view::npos) { - continue; - } - name = name.substr(i_dot + 1, 11); - if (name.starts_with("sub.") && (name.ends_with("RDB1") || name.ends_with("RDB1."))) { - r = std::from_chars(name.data() + 4, name.data() + 6, x); - p.n_blocks = std::max(p.n_blocks, x + 1); - } - } +esrgan_params esrgan_detect_params(model_file const& f) { + if (std::string_view arch = f.arch(); arch != "esrgan") { + throw except("Architecture expected to be 'esrgan', but was '{}' ({})", arch, f.path); } - // 3 layers per upscale block, each upscales x2, 5 blocks for the rest of the model - p.scale = 1 << ((model_len - 5) / 3); + esrgan_params p; + p.scale = f.get_int("esrgan.scale"); + p.n_blocks = f.get_int("esrgan.block_count"); + if (p.scale < 2 || p.scale > 4) { - throw except("Unsupported scale: {}", p.scale); + throw except("ESRGAN: unsupported scale: {}", p.scale); } if (p.n_blocks < 1 || p.n_blocks > 23) { - throw except("Invalid number of blocks: {}", p.n_blocks); + throw except("ESRGAN: invalid number of blocks: {}", p.n_blocks); } return p; } diff --git a/src/visp/arch/esrgan.hpp b/src/visp/arch/esrgan.h similarity index 91% rename from src/visp/arch/esrgan.hpp rename to src/visp/arch/esrgan.h index 2cfe32a..b5a7b7b 100644 --- a/src/visp/arch/esrgan.hpp +++ b/src/visp/arch/esrgan.h @@ -1,6 +1,6 @@ #pragma once -#include "visp/ml.hpp" +#include "visp/ml.h" namespace visp::esrgan { diff --git a/src/visp/arch/migan.cpp b/src/visp/arch/migan.cpp index 5371650..67f1b70 100644 --- a/src/visp/arch/migan.cpp +++ b/src/visp/arch/migan.cpp @@ -1,9 +1,9 @@ -#include "visp/arch/migan.hpp" -#include "visp/image-impl.hpp" -#include "visp/nn.hpp" -#include "visp/vision.hpp" -#include "util/math.hpp" -#include "util/string.hpp" +#include "visp/arch/migan.h" +#include "util/math.h" +#include "util/string.h" +#include "visp/image-impl.h" +#include "visp/nn.h" +#include "visp/vision.h" #include #include @@ -30,20 +30,30 @@ tensor downsample_2d(model_ref m, tensor x) { } tensor 
upsample_2d(model_ref m, tensor x) { - tensor filter_const = m.weights("filter_const"); - filter_const = ggml_reshape_4d(m, filter_const, 1, filter_const->ne[0], filter_const->ne[1], 1); + tensor filter = m.weights("filter_const"); + if (m.flags & model_build_flag::cwhn) { + filter = ggml_reshape_4d(m, filter, 1, filter->ne[0], filter->ne[1], 1); + } - auto [c, w, h, b] = nelements(x); - x = ggml_interpolate(m, x, int(c), int(w * 2), int(h * 2), int(b), GGML_SCALE_MODE_NEAREST); - x = ggml_mul_inplace(m, x, filter_const); + auto [w, h, c, n] = nelements_whcn(m, x); + x = interpolate(m, x, {w * 2, h * 2}, GGML_SCALE_MODE_NEAREST); + x = ggml_mul_inplace(m, x, filter); x = conv_2d_depthwise(m["filter"], x, 1, 2); // 4x4 filter - x = slice(m, x, {}, {0, -1}, {0, -1}, {}); // remove padding from right and bottom + + // remove padding from right and bottom + if (m.flags & model_build_flag::cwhn) { + x = slice(m, x, {}, {0, -1}, {0, -1}, {}); + } else { + x = slice(m, x, {0, -1}, {0, -1}, {}, {}); + } x = ggml_cont(m, x); // required by subsequent ggml_scale for some reason return named(m, x); } tensor separable_conv_2d(model_ref m, tensor x, flags flags) { - int pad = int(m["conv1"].weights("weight")->ne[2] / 2); + int kdim = (m.flags & model_build_flag::cwhn) ? 2 : 0; // to get kernel size + int pad = int(m["conv1"].weights("weight")->ne[kdim] / 2); + x = conv_2d_depthwise(m["conv1"], x, 1, pad); if (flags & conv::activation) { x = lrelu_agc(m, x, 0.2f, sqrt2, 256); @@ -60,7 +70,9 @@ tensor separable_conv_2d(model_ref m, tensor x, flags flags) { if (flags & conv::noise) { tensor noise = m.weights("noise_const"); noise = ggml_mul_inplace(m, noise, m.weights("noise_strength")); - noise = ggml_reshape_4d(m, noise, 1, noise->ne[0], noise->ne[1], 1); + if (m.flags & model_build_flag::cwhn) { + noise = ggml_reshape_4d(m, noise, 1, noise->ne[0], noise->ne[1], 1); + } x = ggml_add_inplace(m, x, noise); } if (flags & conv::activation) { @@ -70,6 +82,7 @@ tensor separable_conv_2d(model_ref m, tensor x, flags flags) { } tensor from_rgb(model_ref m, tensor x) { + x = cwhn_to_contiguous_2d(m, x); x = conv_2d(m["fromrgb"], x); x = lrelu_agc(m, x, 0.2f, sqrt2, 256); return named(m, x); @@ -122,6 +135,7 @@ tensor synthesis(model_ref m, tensor x_in, Features feats, int res) { model_ref block = m[format("b{}", res >> i)]; std::tie(x, img) = synthesis_block(block, x, feats[i], img, conv::upsample, conv::noise); } + img = contiguous_2d_to_cwhn(m, img); return img; } @@ -150,14 +164,13 @@ tensor migan_generate(model_ref m, tensor image, migan_params const& p) { return compute_graph_output(m, result); } -migan_params migan_detect_params(model_ref m) { - if (m.find("encoder.b512.fromrgb.weight") != nullptr) { - return migan_params{512}; - } else if (m.find("encoder.b256.fromrgb.weight") != nullptr) { - return migan_params{256}; - } else { - throw std::runtime_error("Failed to detect model parameters"); +migan_params migan_detect_params(model_file const& f) { + if (std::string_view arch = f.arch(); arch != "migan") { + throw except("Architecture expected to be 'migan', but was '{}' ({})", arch, f.path); } + migan_params p; + p.resolution = f.get_int("migan.image_size"); + return p; } image_data migan_process_input(image_view image, image_view mask, migan_params const& p) { @@ -182,7 +195,7 @@ image_data migan_process_input(image_view image, image_view mask, migan_params c } image_data migan_process_output(std::span data, i32x2 extent, migan_params const& p) { - i32x2 model_extent = {p.resolution,p.resolution}; + 
i32x2 model_extent = {p.resolution, p.resolution}; image_view image(model_extent, image_format::rgb_f32, data.data()); image_data resized; if (model_extent != extent) { diff --git a/src/visp/arch/migan.hpp b/src/visp/arch/migan.h similarity index 93% rename from src/visp/arch/migan.hpp rename to src/visp/arch/migan.h index d7bec80..e2ccd51 100644 --- a/src/visp/arch/migan.hpp +++ b/src/visp/arch/migan.h @@ -1,8 +1,8 @@ #pragma once -#include "visp/image.hpp" -#include "visp/ml.hpp" -#include "visp/util.hpp" +#include "visp/image.h" +#include "visp/ml.h" +#include "visp/util.h" #include #include diff --git a/src/visp/arch/mobile-sam.cpp b/src/visp/arch/mobile-sam.cpp index eb0da1f..7beaef4 100644 --- a/src/visp/arch/mobile-sam.cpp +++ b/src/visp/arch/mobile-sam.cpp @@ -1,8 +1,8 @@ -#include "visp/arch/mobile-sam.hpp" -#include "visp/nn.hpp" -#include "visp/vision.hpp" -#include "util/math.hpp" -#include "util/string.hpp" +#include "visp/arch/mobile-sam.h" +#include "visp/nn.h" +#include "visp/vision.h" +#include "util/math.h" +#include "util/string.h" #include @@ -23,17 +23,16 @@ tensor conv_2d_depthwise_batch_norm(model_ref m, tensor x, int stride = 1, int p } tensor window_partition(model_ref m, tensor x, int window) { - int64_t c = x->ne[0]; - int64_t b = x->ne[3]; + auto [c, w, h, b] = nelements(x); if (m.flags & model_build_flag::window_partition) { x = ggml_win_part(m, x, window); x = ggml_reshape_3d(m, x, c, window * window, x->ne[3]); return x; } - int64_t px = (window - x->ne[1] % window) % window; - int64_t py = (window - x->ne[2] % window) % window; - int64_t npw = (x->ne[1] + px) / window; - int64_t nph = (x->ne[2] + py) / window; + int64_t px = (window - w % window) % window; + int64_t py = (window - h % window) % window; + int64_t npw = (w + px) / window; + int64_t nph = (h + py) / window; if (px > 0 || py > 0) { x = ggml_pad(m, x, 0, int(px), int(py), 0); @@ -93,21 +92,24 @@ tensor mb_conv(model_ref m, tensor x) { return named(m, x); } -tensor patch_merging(model_ref m, tensor x, int input_resolution) { - if (x->ne[2] == 1) { - x = ggml_reshape_4d(m, x, x->ne[0], input_resolution, input_resolution, x->ne[3]); - } +tensor patch_merging(model_ref m, tensor x) { x = conv_2d_batch_norm(m["conv1"], x); x = ggml_gelu_inplace(m, x); - int c_out = int(m.weights("conv2.c.weight")->ne[0]); + int c_out_dim = is_cwhn(m) ? 0 : 3; + int c_out = int(m.weights("conv2.c.weight")->ne[c_out_dim]); int stride = (c_out == 320 || c_out == 448 || c_out == 576) ? 
1 : 2; x = conv_2d_depthwise_batch_norm(m["conv2"], x, stride, 1); x = ggml_gelu_inplace(m, x); - auto [c, h, w, b] = nelements(x); + auto [w, h, c, b] = nelements_whcn(m, x); x = conv_2d_batch_norm(m["conv3"], x); - x = ggml_reshape_3d(m, x, c, w * h, b); + if (is_whcn(m)) { + x = ggml_reshape_3d(m, x, w * h, c, b); + x = ggml_cont(m, ggml_permute(m, x, 1, 0, 2, 3)); + } else { + x = ggml_reshape_3d(m, x, c, w * h, b); + } // -> always [c, wh, b] return named(m, x); } @@ -175,8 +177,10 @@ tensor tiny_vit_block( x = ggml_reshape_3d(m, x, c, spatial, b); x = ggml_add_inplace(m, x, res_x); + model_ref local_conv = m["local_conv"]; + local_conv.flags |= model_build_flag::cwhn; x = ggml_reshape_4d(m, x, c, w, h, b); - x = conv_2d_depthwise_batch_norm(m["local_conv"], x, 1, 1); + x = conv_2d_depthwise_batch_norm(local_conv, x, 1, 1); x = ggml_reshape_3d(m, x, c, spatial, b); tensor x_mlp = mlp(m["mlp"], x); @@ -189,7 +193,7 @@ tensor conv_layer(model_ref m, tensor x, tiny_vit_params::layer p) { for (int i = 0; i < p.depth; ++i) { x = mb_conv(block[i], x); } - x = patch_merging(m["downsample"], x, p.resolution); + x = patch_merging(m["downsample"], x); return named(m, x); } @@ -199,12 +203,15 @@ tensor basic_layer(model_ref m, tensor x, tiny_vit_params::layer const& p) { x = tiny_vit_block(blocks[i], x, p.resolution, p.embed_dim, p.num_heads, p.window_size); } if (p.downsample) { - x = patch_merging(m["downsample"], x, p.resolution); + x = ggml_reshape_4d(m, x, x->ne[0], p.resolution, p.resolution, x->ne[2]); + x = cwhn_to_contiguous_2d(m, x); + x = patch_merging(m["downsample"], x); } return named(m, x); } tensor tiny_vit(model_ref m, tensor x, tiny_vit_params const& p) { + x = cwhn_to_contiguous_2d(m, x); x = patch_embed(m["patch_embed"], x); x = conv_layer(m["layers.0"], x, p.layers[0]); @@ -216,10 +223,15 @@ tensor tiny_vit(model_ref m, tensor x, tiny_vit_params const& p) { x = ggml_reshape_4d(m, x, x->ne[0], 64, 64, x->ne[2]); // neck + x = cwhn_to_contiguous_2d(m, x); x = conv_2d(m["neck.0"], x); + x = contiguous_2d_to_cwhn(m, x); x = layer_norm(m["neck.1"], x); + x = cwhn_to_contiguous_2d(m, x); x = conv_2d(m["neck.2"], x, 1, 1); + x = contiguous_2d_to_cwhn(m, x); x = layer_norm(m["neck.3"], x); + return x; } @@ -418,6 +430,7 @@ auto two_way_transformer( } tensor upscale_outputs(model_ref m, tensor x) { + m.flags |= model_build_flag::cwhn; x = conv_transpose_2d(m[0], x, 2); x = layer_norm(m[1], x); x = ggml_gelu_inplace(m, x); diff --git a/src/visp/arch/mobile-sam.hpp b/src/visp/arch/mobile-sam.h similarity index 94% rename from src/visp/arch/mobile-sam.hpp rename to src/visp/arch/mobile-sam.h index f2be4b1..6e38868 100644 --- a/src/visp/arch/mobile-sam.hpp +++ b/src/visp/arch/mobile-sam.h @@ -1,8 +1,8 @@ #pragma once -#include "visp/image.hpp" -#include "visp/ml.hpp" -#include "visp/vision.hpp" +#include "visp/image.h" +#include "visp/ml.h" +#include "visp/vision.h" #include #include @@ -41,7 +41,7 @@ float resize_longest_side(i32x2 extent, int target_longest_side); tensor patch_embed(model_ref m, tensor x); tensor mb_conv(model_ref m, tensor x); -tensor patch_merging(model_ref m, tensor x, int input_resolution); +tensor patch_merging(model_ref m, tensor x); tensor mlp(model_ref m, tensor x); tensor attention_rel_bias(model_ref m, tensor x, int dim, int num_heads); tensor window_partition(model_ref m, tensor x, int window); diff --git a/src/visp/image-impl.hpp b/src/visp/image-impl.h similarity index 98% rename from src/visp/image-impl.hpp rename to src/visp/image-impl.h index 
7be76fe..6072fc8 100644 --- a/src/visp/image-impl.hpp +++ b/src/visp/image-impl.h @@ -1,8 +1,8 @@ #pragma once -#include "util/math.hpp" -#include "util/string.hpp" -#include "visp/image.hpp" +#include "util/math.h" +#include "util/string.h" +#include "visp/image.h" #include #include diff --git a/src/visp/image.cpp b/src/visp/image.cpp index 235c97d..364723e 100644 --- a/src/visp/image.cpp +++ b/src/visp/image.cpp @@ -1,7 +1,7 @@ -#include "visp/image.hpp" -#include "image-impl.hpp" -#include "util/math.hpp" -#include "util/string.hpp" +#include "visp/image.h" +#include "image-impl.h" +#include "util/math.h" +#include "util/string.h" #include #include @@ -169,6 +169,10 @@ image_data image_alloc(i32x2 extent, image_format format) { return image_data{extent, format, std::unique_ptr<uint8_t[]>(new uint8_t[size])}; } +void image_clear(image_span const& img) { + memset(img.data, 0, n_bytes(img)); +} + image_format image_format_from_channels(int n_channels) { switch (n_channels) { case 1: return image_format::alpha_u8; @@ -629,10 +633,13 @@ void tile_merge( coverage[i] = layout.overlap[i]; } } - float norm = float((coverage[0] + 1) * (coverage[1] + 1)); - float blend = weight > 0 ? weight / norm : 1.0f; - - dst.store(idx, dst.load(idx) + blend * tile.load(idx - beg)); + f32x4 val = tile.load(idx - beg); + if (weight > 0) { + float norm = float((coverage[0] + 1) * (coverage[1] + 1)); + float blend = weight / norm; + val = dst.load(idx) + blend * val; + } + dst.store(idx, val); } } } diff --git a/src/visp/ml.cpp b/src/visp/ml.cpp index f75eccb..dbc6a1c 100644 --- a/src/visp/ml.cpp +++ b/src/visp/ml.cpp @@ -1,6 +1,6 @@ -#include "visp/ml.hpp" -#include "visp/platform.hpp" -#include "util/string.hpp" +#include "visp/ml.h" +#include "util/string.h" +#include "visp/platform.h" #include #include @@ -33,7 +33,7 @@ bool load_ggml_backends() { auto str = dir.parent_path().u8string(); ggml_backend_load_all_from_path((char const*)str.c_str()); } - } + } return true; }(); return loaded; @@ -94,6 +94,13 @@ ggml_type backend_device::preferred_float_type() const { return GGML_TYPE_COUNT; // no preference, use float type of model weights } +tensor_data_layout backend_device::preferred_layout() const { + if (type() == backend_type::cpu) { + return tensor_data_layout::cwhn; + } + return tensor_data_layout::unknown; // no preference, keep model weight layout +} + size_t backend_device::total_memory() const { ggml_backend_dev_t dev = ggml_backend_get_device(handle.get()); size_t free, total; @@ -114,115 +121,293 @@ void backend_set_n_threads(backend_device& b, int n_threads) { } // -// model_weights +// model_build_flags -bool is_float_type(ggml_type t) { - return t != GGML_TYPE_I8 && t != GGML_TYPE_I16 && t != GGML_TYPE_I32 && t != GGML_TYPE_I64; +model_build_flags backend_default_flags(backend_type type) { + using enum model_build_flag; + switch (type) { + case backend_type::cpu: + return conv_2d_direct_cwhn | concat_n | f16_conv_transpose | window_partition; + case backend_type::gpu: return {}; + } + return {}; } -struct float_converter { - ggml_type target; - ggml_type_traits const* dst_traits = nullptr; - std::vector<float> f32_buffer; - std::vector<uint8_t> dst_buffer; +model_build_flags model_get_build_flags(model_file const& file) { + fixed_string<64> str; + std::string_view arch = file.arch(); + model_build_flags flags; - explicit float_converter(ggml_type target_type) : target(target_type) { - if (target != GGML_TYPE_COUNT) { - dst_traits = ggml_get_type_traits(target_type); + int64_t key = gguf_find_key(file.gguf.get(), format(str, 
"{}.tensor_data_layout", arch)); + if (key != -1) { + std::string_view layout = gguf_get_val_str(file.gguf.get(), key); + if (layout == "cwhn") { + flags |= model_build_flag::cwhn; } } + return flags; +} - ggml_type target_type(ggml_tensor const* t) const { - if (target == GGML_TYPE_COUNT || !is_float_type(t->type)) { - return t->type; - } - return target; +// +// model_file + +model_file model_load(char const* filepath) { + ggml_context* data_ctx; + gguf_init_params params; + params.no_alloc = false; + params.ctx = &data_ctx; + + gguf_context_ptr gguf_ctx(gguf_init_from_file(filepath, params)); + if (!gguf_ctx) { + throw except("Failed to load GGUF model: {}", filepath); } + return model_file{std::move(gguf_ctx), ggml_context_ptr(data_ctx), filepath}; +} - void const* operator()(ggml_tensor const* src, ggml_tensor const* dst) { - if (target == GGML_TYPE_COUNT || src->type == dst->type) { - return src->data; - } - ASSERT(dst->type == target); +int64_t model_file::n_tensors() const { + return gguf_get_n_tensors(gguf.get()); +} - float const* f32_data = reinterpret_cast<float const*>(src->data); - if (src->type != GGML_TYPE_F32) { - if (int64_t(f32_buffer.size()) < ggml_nelements(src)) { - f32_buffer.resize(ggml_nelements(src)); - } - ggml_type_traits const* src_traits = ggml_get_type_traits(src->type); - src_traits->to_float(src->data, f32_buffer.data(), ggml_nelements(src)); - f32_data = f32_buffer.data(); - } - void const* dst_data = f32_data; - if (target != GGML_TYPE_F32) { - if (dst_buffer.size() < ggml_nbytes(dst)) { - dst_buffer.resize(ggml_nbytes(dst)); - } - dst_traits->from_float_ref(f32_data, dst_buffer.data(), ggml_nelements(dst)); - dst_data = dst_buffer.data(); +int64_t model_file::key(char const* name) const { + int64_t key_id = gguf_find_key(gguf.get(), name); + if (key_id == -1) { + throw except("Can't find key '{}' in model file {}", name, path); + } + return key_id; +} + +std::string_view model_file::get_string(char const* key_name) const { + return gguf_get_val_str(gguf.get(), key(key_name)); +} + +int model_file::get_int(char const* key_name) const { + return gguf_get_val_i32(gguf.get(), key(key_name)); +} + +std::string_view model_file::arch() const { + return get_string("general.architecture"); +} + +tensor_data_layout model_file::tensor_layout() const { + fixed_string<64> str; + int64_t key = gguf_find_key(gguf.get(), format(str, "{}.tensor_data_layout", arch())); + if (key != -1) { + std::string_view layout = gguf_get_val_str(gguf.get(), key); + if (layout == "cwhn") { + return tensor_data_layout::cwhn; + } else if (layout == "whcn") { + return tensor_data_layout::whcn; } - return dst_data; } -}; + return tensor_data_layout::unknown; +} -model_weights model_init(backend_device const& be, size_t size) { +// +// model_weights + +model_weights model_init(size_t size) { ggml_init_params params{}; params.mem_size = size * ggml_tensor_overhead(); params.no_alloc = true; ggml_context_ptr ctx(ggml_init(params)); - return model_weights{std::move(ctx), be.type(), {}, {}}; + model_weights w{}; + w.context = std::move(ctx); + w.buffer_type = backend_type::cpu; + return w; } -model_weights model_load(char const* filepath, backend_device const& backend, model_load_params p) { +bool model_allocate(model_weights& m, backend_device const& b) { + ggml_backend_buffer_ptr buffer(ggml_backend_alloc_ctx_tensors(m.context.get(), b.handle.get())); + if (!buffer) { + return false; // context contains nothing to allocate + } + m.buffer_type = b.type(); + m.extra_buffers.push_back(std::move(buffer)); + 
return true; +} - ggml_context* data_ctx; - gguf_init_params params; - params.no_alloc = false; - params.ctx = &data_ctx; +namespace { - gguf_context_ptr gguf_ctx(gguf_init_from_file(filepath, params)); - if (!gguf_ctx) { - throw std::runtime_error("Failed to load GGUF model"); - } - ggml_context_ptr data_ctx_ptr(data_ctx); - int64_t n_weights = gguf_get_n_tensors(gguf_ctx.get()); - - ggml_init_params model_ctx_params{}; - model_ctx_params.mem_size = (n_weights + p.n_extra_tensors) * ggml_tensor_overhead(); - model_ctx_params.no_alloc = true; - ggml_context_ptr model_ctx(ggml_init(model_ctx_params)); - - float_converter convert(p.float_type); - for (int64_t i = 0; i < gguf_get_n_tensors(gguf_ctx.get()); ++i) { - auto name = gguf_get_tensor_name(gguf_ctx.get(), i); - tensor orig = ggml_get_tensor(data_ctx, name); - tensor dup = ggml_new_tensor( - model_ctx.get(), convert.target_type(orig), GGML_MAX_DIMS, orig->ne); - ggml_set_name(dup, name); +bool is_float_type(ggml_type t) { + return t != GGML_TYPE_I8 && t != GGML_TYPE_I16 && t != GGML_TYPE_I32 && t != GGML_TYPE_I64; +} + +int64_t max_tensor_elements(ggml_context* ctx) { + int64_t result = 0; + for (ggml_tensor* t = ggml_get_first_tensor(ctx); t; t = ggml_get_next_tensor(ctx, t)) { + result = std::max(result, ggml_nelements(t)); } + return result; +} - ggml_backend_buffer_ptr buffer(ggml_backend_alloc_ctx_tensors(model_ctx.get(), backend)); +ggml_type detect_float_type(ggml_context* ctx) { + for (ggml_tensor* t = ggml_get_first_tensor(ctx); t; t = ggml_get_next_tensor(ctx, t)) { + if (is_float_type(t->type)) { + return t->type; + } + } + return GGML_TYPE_F32; +} - for (ggml_tensor* t = ggml_get_first_tensor(model_ctx.get()); t != nullptr; - t = ggml_get_next_tensor(model_ctx.get(), t)) { - tensor data_tensor = ggml_get_tensor(data_ctx, ggml_get_name(t)); - void const* data = convert(data_tensor, t); - ggml_backend_tensor_set(t, data, 0, ggml_nbytes(t)); +template <typename T> +void permute_whcn_to_cwhn(T* n, bool depthwise) { + if (depthwise) { // wh1c -> c1wh + T perm[] = {n[3], n[2], n[0], n[1]}; + std::copy(perm, perm + 4, n); + } else { + std::swap(n[0], n[2]); // -> chwn + std::swap(n[1], n[2]); // -> cwhn } - return model_weights{std::move(model_ctx), backend.type(), std::move(buffer), {}}; } -bool model_allocate(model_weights& m, backend_device const& b) { - ASSERT(m.buffer_type == b.type(), "Model weights must all be on the same backend"); +struct tensor_converter { + ggml_type src_type; + ggml_type dst_type; + ggml_backend_ptr backend; + ggml_context_ptr ctx; + ggml_cgraph* graph; + ggml_gallocr_ptr gallocr; + ggml_tensor convert_src{}; + ggml_tensor* convert_dst; - ggml_backend_buffer_ptr buffer(ggml_backend_alloc_ctx_tensors(m.context.get(), b.handle.get())); - if (!buffer) { - return false; // context contains nothing to allocate + tensor_converter(ggml_context* weights, ggml_type target_type, bool whcn_to_cwhn) + : dst_type(target_type) { + + if (dst_type == GGML_TYPE_COUNT && !whcn_to_cwhn) { + return; + } + src_type = detect_float_type(weights); + if (src_type == dst_type && !whcn_to_cwhn) { + return; + } + if (dst_type == GGML_TYPE_COUNT) { + dst_type = src_type; + } + + ggml_init_params ctx_params{ + .mem_size = ggml_tensor_overhead() + ggml_graph_overhead(), + .mem_buffer = nullptr, + .no_alloc = true}; + ctx.reset(ggml_init(ctx_params)); + + size_t max_elem = max_tensor_elements(weights); + graph = ggml_new_graph_custom(ctx.get(), 2, false); + convert_src.type = src_type; + convert_src.ne[0] = max_elem; + convert_src.nb[0] = 
ggml_type_size(src_type); + for (int i = 1; i < GGML_MAX_DIMS; ++i) { + convert_src.ne[i] = 1; + convert_src.nb[i] = convert_src.nb[i - 1] * convert_src.ne[i - 1]; + } + convert_dst = ggml_cast(ctx.get(), &convert_src, dst_type); + ggml_set_output(convert_dst); + ggml_build_forward_expand(graph, convert_dst); + + gallocr.reset(ggml_gallocr_new(ggml_backend_cpu_buffer_type())); + ggml_gallocr_reserve(gallocr.get(), graph); + + backend.reset(ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr)); + } + + ggml_type target_type(ggml_tensor const* t) const { + if (dst_type == GGML_TYPE_COUNT || !is_float_type(t->type)) { + return t->type; + } + return dst_type; + } + + void const* operator()(ggml_tensor const* src, ggml_tensor const* dst, bool whcn_to_cwhn) { + bool need_type_conv = is_float_type(src->type) && src->type != dst_type; + if (dst_type == GGML_TYPE_COUNT || !(need_type_conv || whcn_to_cwhn)) { + return src->data; + } + ASSERT(ctx, "Weights contain tensors that would require conversion"); + + convert_src.type = src->type; + convert_src.data = src->data; + std::copy(src->ne, src->ne + GGML_MAX_DIMS, convert_src.ne); + std::copy(src->nb, src->nb + GGML_MAX_DIMS, convert_src.nb); + if (whcn_to_cwhn) { + bool depthwise = convert_src.ne[2] == 1; + permute_whcn_to_cwhn(convert_src.ne, depthwise); + permute_whcn_to_cwhn(convert_src.nb, depthwise); + } + + ASSERT(convert_dst->type == dst->type); + std::copy(dst->ne, dst->ne + GGML_MAX_DIMS, convert_dst->ne); + std::copy(dst->nb, dst->nb + GGML_MAX_DIMS, convert_dst->nb); + + bool alloc_ok = ggml_gallocr_alloc_graph(gallocr.get(), graph); + ASSERT(alloc_ok); + + ggml_backend_graph_compute(backend.get(), graph); + return convert_dst->data; + } +}; + +span<int32_t const> find_conv2d_weight_indices(model_file const& f) { + gguf_context* gguf = f.gguf.get(); + auto name = format<fixed_string<64>>("{}.conv2d_weights", f.arch()); + int64_t key = gguf_find_key(gguf, name.c_str()); + if (key != -1 && gguf_get_arr_type(gguf, key) == GGUF_TYPE_INT32) { + size_t n = gguf_get_arr_n(gguf, key); + int32_t const* a = reinterpret_cast<int32_t const*>(gguf_get_arr_data(gguf, key)); + return span(a, n); + } + return {}; +} + +} // namespace + +void model_transfer( + model_file const& file, + model_weights& weights, + backend_device const& device, + ggml_type float_type, + tensor_data_layout layout) { + + gguf_context* gguf = file.gguf.get(); + ggml_context* src_ctx = file.data.get(); + ggml_context* dst_ctx = weights.context.get(); + + tensor_data_layout file_layout = file.tensor_layout(); + bool to_cwhn = file_layout == tensor_data_layout::whcn && layout == tensor_data_layout::cwhn; + tensor_converter convert(src_ctx, float_type, to_cwhn); + // Try to find a list of tensor indices which are weights of 2D operations + span<int32_t const> conv2d_weights = find_conv2d_weight_indices(file); + + for (int64_t i = 0, conv2d_idx = 0; i < gguf_get_n_tensors(gguf); ++i) { + auto name = gguf_get_tensor_name(gguf, i); + tensor orig = ggml_get_tensor(src_ctx, name); // TODO: don't use name lookup + auto ne = nelements(orig); + if (to_cwhn && conv2d_idx < ssize(conv2d_weights) && conv2d_weights[conv2d_idx] == i) { + permute_whcn_to_cwhn(ne.data(), ne[2] == 1); + ++conv2d_idx; + } + tensor dup = ggml_new_tensor(dst_ctx, convert.target_type(orig), GGML_MAX_DIMS, ne.data()); + ggml_set_name(dup, name); + } + + ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(dst_ctx, device); + weights.weights_buffer = ggml_backend_buffer_ptr(buffer); + weights.buffer_type = device.type(); + weights.flags = 
model_get_build_flags(file); + if (to_cwhn) { + weights.flags |= model_build_flag::cwhn; + } + + ggml_tensor* t = ggml_get_first_tensor(dst_ctx); + for (int i = 0, conv2d_idx = 0; t; ++i) { + tensor data_tensor = ggml_get_tensor(src_ctx, ggml_get_name(t)); + bool is_2d = conv2d_idx < int(conv2d_weights.size()) && conv2d_weights[conv2d_idx] == i; + if (is_2d) { + ++conv2d_idx; + } + void const* data = convert(data_tensor, t, is_2d && to_cwhn); + ggml_backend_tensor_set(t, data, 0, ggml_nbytes(t)); + t = ggml_get_next_tensor(dst_ctx, t); } - m.extra_buffers.push_back(std::move(buffer)); - return true; } ggml_type model_weights::float_type() const { @@ -266,27 +451,17 @@ void compute(compute_graph const& g, backend_device const& b) { // // model_ref -model_build_flags default_backend_flags(backend_type type) { - using enum model_build_flag; - switch (type) { - case backend_type::cpu: - return cwhn | conv_2d_direct | fused_batch_norm | f16_conv_transpose | window_partition; - case backend_type::gpu: return cwhn; - } - return {}; -} - model_ref::model_ref(model_weights& m) : weights_context(m.context.get()), graph_context(m.context.get()), graph(nullptr), - flags(default_backend_flags(m.buffer_type)) {} + flags(m.flags | backend_default_flags(m.buffer_type)) {} model_ref::model_ref(model_weights& m, compute_graph& g) : weights_context(m.context.get()), graph_context(g.context.get()), graph(g.graph), - flags(default_backend_flags(m.buffer_type)) {} + flags(m.flags | backend_default_flags(m.buffer_type)) {} model_ref::model_ref( ggml_context* weights_context, @@ -473,8 +648,11 @@ tensor concat(model_ref const& m, std::array src, int dim) } tensor interpolate(model_ref const& m, tensor x, i64x2 target, int32_t mode) { - return ggml_interpolate( - m, x, int(target[0]), int(target[1]), int(x->ne[2]), int(x->ne[3]), mode); + if ((m.flags & model_build_flag::cwhn) && mode == GGML_SCALE_MODE_NEAREST) { + return ggml_interpolate(m, x, x->ne[0], target[0], target[1], x->ne[3], mode); + } + // Bilinear interpolation requires WHCN layout! + return ggml_interpolate(m, x, target[0], target[1], x->ne[2], x->ne[3], mode); } } // namespace visp diff --git a/src/visp/nn.cpp b/src/visp/nn.cpp index 7dc84a2..7b6065b 100644 --- a/src/visp/nn.cpp +++ b/src/visp/nn.cpp @@ -1,18 +1,14 @@ -#include "nn.hpp" -#include "util/string.hpp" +#include "nn.h" +#include "util/string.h" namespace visp { -tensor add_bias(model_ref m, tensor x) { - if (tensor bias = m.find("bias")) { - x = ggml_add_inplace(m, x, bias); - } - return x; -} tensor linear(model_ref m, tensor x) { x = ggml_mul_mat(m, m.weights("weight"), x); - x = add_bias(m, x); + if (tensor bias = m.find("bias")) { + x = ggml_add_inplace(m, x, bias); + } return x; } @@ -31,87 +27,150 @@ tensor permute_whcn_to_cwhn(model_ref m, tensor x) { return ggml_permute(m, x, 1, 2, 0, 3); } -tensor conv_2d(model_ref m, tensor x, int stride, int pad) { - ASSERT(m.flags & model_build_flag::cwhn); +std::array<int64_t, 4> nelements_whcn(model_ref const& m, tensor t) { + auto ne = nelements(t); + return (m.flags & model_build_flag::cwhn) ? 
std::array{ne[1], ne[2], ne[0], ne[3]} : ne; +} + +tensor cwhn_to_contiguous_2d(model_ref m, tensor x) { + if (m.flags & model_build_flag::cwhn) { + return x; // preferred 2D layout is CWHN too + } + return ggml_cont(m, permute_cwhn_to_whcn(m, x)); +} + +tensor whcn_to_contiguous_2d(model_ref m, tensor x) { + if (m.flags & model_build_flag::cwhn) { + return ggml_cont(m, permute_whcn_to_cwhn(m, x)); + } + return x; +} + +tensor contiguous_2d_to_cwhn(model_ref m, tensor x) { + if (m.flags & model_build_flag::cwhn) { + return x; // x is already CWHN + } + return ggml_cont(m, permute_whcn_to_cwhn(m, x)); +} + +tensor contiguous_2d_to_whcn(model_ref m, tensor x) { + if (m.flags & model_build_flag::cwhn) { + return ggml_cont(m, permute_cwhn_to_whcn(m, x)); + } + return x; +} + +tensor add_bias_2d(model_ref m, tensor x) { + if (tensor bias = m.find("bias")) { + if (!(m.flags & model_build_flag::cwhn)) { + bias = ggml_reshape_4d(m, bias, 1, 1, bias->ne[0], 1); + } + x = ggml_add_inplace(m, x, bias); + } + return x; +} +tensor conv_2d(model_ref m, tensor x, int stride, int pad) { tensor weight = m.weights("weight"); - if (weight->ne[1] == 1 && weight->ne[2] == 1 && stride == 1) { - auto [c, w, h, b] = nelements(x); - weight = ggml_reshape_2d(m, weight, weight->ne[0], weight->ne[3]); - x = ggml_reshape_2d(m, x, x->ne[0], w * h * b); - x = ggml_mul_mat(m, weight, x); - x = ggml_reshape_4d(m, x, weight->ne[1], w, h, b); - - } else if (m.flags & model_build_flag::conv_2d_direct) { - weight = permute_cwhn_to_whcn(m, weight); - x = permute_cwhn_to_whcn(m, x); - x = ggml_conv_2d(m, weight, x, stride, stride, pad, pad, 1, 1); - x = permute_whcn_to_cwhn(m, x); - } else { - x = permute_cwhn_to_whcn(m, x); - tensor permuted_weight = permute_cwhn_to_whcn(m, weight); - tensor cols = ggml_im2col( - m, permuted_weight, x, stride, stride, pad, pad, 1, 1, true, GGML_TYPE_F32); - tensor a = ggml_reshape_2d(m, cols, cols->ne[0], cols->ne[1] * cols->ne[2] * cols->ne[3]); - tensor b = ggml_reshape_2d( - m, weight, weight->ne[0] * weight->ne[1] * weight->ne[2], weight->ne[3]); - x = ggml_mul_mat(m, b, a); - x = ggml_reshape_4d(m, x, weight->ne[3], cols->ne[1], cols->ne[2], cols->ne[3]); + if (m.flags & model_build_flag::cwhn) { + if (weight->ne[1] == 1 && weight->ne[2] == 1 && stride == 1) { + auto [c, w, h, b] = nelements(x); + weight = ggml_reshape_2d(m, weight, weight->ne[0], weight->ne[3]); + x = ggml_reshape_2d(m, x, x->ne[0], w * h * b); + x = ggml_mul_mat(m, weight, x); + x = ggml_reshape_4d(m, x, weight->ne[1], w, h, b); + + } else if (m.flags & model_build_flag::conv_2d_direct_cwhn) { + weight = permute_cwhn_to_whcn(m, weight); + x = permute_cwhn_to_whcn(m, x); + x = ggml_conv_2d_direct(m, weight, x, stride, stride, pad, pad, 1, 1); + x = permute_whcn_to_cwhn(m, x); + + } else { + x = permute_cwhn_to_whcn(m, x); + tensor permuted_weight = permute_cwhn_to_whcn(m, weight); + tensor cols = ggml_im2col( + m, permuted_weight, x, stride, stride, pad, pad, 1, 1, true, GGML_TYPE_F32); + tensor a = ggml_reshape_2d( + m, cols, cols->ne[0], cols->ne[1] * cols->ne[2] * cols->ne[3]); + tensor b = ggml_reshape_2d( + m, weight, weight->ne[0] * weight->ne[1] * weight->ne[2], weight->ne[3]); + x = ggml_mul_mat(m, b, a); + x = ggml_reshape_4d(m, x, weight->ne[3], cols->ne[1], cols->ne[2], cols->ne[3]); + } + } else { // WHCN layout + x = ggml_conv_2d_direct(m, weight, x, stride, stride, pad, pad, 1, 1); } - x = add_bias(m, x); + x = add_bias_2d(m, x); return x; } tensor conv_2d_depthwise(model_ref m, tensor x, int stride, int 
pad) { - ASSERT(m.flags & model_build_flag::cwhn); - - tensor weight = ggml_permute(m, m.weights("weight"), 3, 2, 0, 1); - x = permute_cwhn_to_whcn(m, x); - x = ggml_conv_2d_dw_direct(m, weight, x, stride, stride, pad, pad, 1, 1); - x = permute_whcn_to_cwhn(m, x); + tensor weight = m.weights("weight"); - x = add_bias(m, x); + if (m.flags & model_build_flag::cwhn) { + weight = ggml_permute(m, weight, 3, 2, 0, 1); + x = permute_cwhn_to_whcn(m, x); + x = ggml_conv_2d_dw_direct(m, weight, x, stride, stride, pad, pad, 1, 1); + x = permute_whcn_to_cwhn(m, x); + } else { + x = ggml_conv_2d_dw_direct(m, weight, x, stride, stride, pad, pad, 1, 1); + } + x = add_bias_2d(m, x); return x; } tensor conv_transpose_2d(model_ref m, tensor x, int stride) { - ASSERT(m.flags & model_build_flag::cwhn); - tensor weight = m.weights("weight"); if (m.flags & model_build_flag::f16_conv_transpose) { // TODO: ggml_conv_transpose_2d_p0 expects fp16 weights (cpu backend) weight = ggml_cast(m, weight, GGML_TYPE_F16); } - x = ggml_cont(m, permute_cwhn_to_whcn(m, x)); + if (m.flags & model_build_flag::cwhn) { + x = ggml_cont(m, permute_cwhn_to_whcn(m, x)); + } x = ggml_conv_transpose_2d_p0(m, weight, x, stride); - x = ggml_cont(m, permute_whcn_to_cwhn(m, x)); - x = add_bias(m, x); + + if (m.flags & model_build_flag::cwhn) { + x = ggml_cont(m, permute_whcn_to_cwhn(m, x)); + } + x = add_bias_2d(m, x); return x; } tensor conv_2d_deform( model_ref m, tensor x, tensor weight, tensor offset, tensor mask, int stride, int pad) { - ASSERT(m.flags & model_build_flag::cwhn); - x = permute_cwhn_to_whcn(m, x); - weight = permute_cwhn_to_whcn(m, weight); - offset = permute_cwhn_to_whcn(m, offset); - if (mask) { - mask = permute_cwhn_to_whcn(m, mask); + if (m.flags & model_build_flag::cwhn) { + x = permute_cwhn_to_whcn(m, x); + weight = permute_cwhn_to_whcn(m, weight); + offset = permute_cwhn_to_whcn(m, offset); + if (mask) { + mask = permute_cwhn_to_whcn(m, mask); + } } x = ggml_conv_2d_deform(m, weight, x, offset, mask, stride, stride, pad, pad); - x = permute_whcn_to_cwhn(m, x); + + if (m.flags & model_build_flag::cwhn) { + x = permute_whcn_to_cwhn(m, x); + } return x; } tensor batch_norm_2d(model_ref m, tensor x) { - ASSERT(m.flags & model_build_flag::cwhn); + // Batch norm is expected to have been fused into mul+add. See convert.py ASSERT(m.find("running_mean") == nullptr, "Batch norm was not fused"); ASSERT(m.find("running_var") == nullptr, "Batch norm was not fused"); - x = ggml_mul_inplace(m, x, m.weights("weight")); - x = ggml_add_inplace(m, x, m.weights("bias")); + tensor weight = m.weights("weight"); + tensor bias = m.weights("bias"); + if (!(m.flags & model_build_flag::cwhn)) { // WHCN layout + weight = ggml_reshape_4d(m, weight, 1, 1, weight->ne[0], 1); + bias = ggml_reshape_4d(m, bias, 1, 1, bias->ne[0], 1); + } + x = ggml_mul_inplace(m, x, weight); + x = ggml_add_inplace(m, x, bias); return named(m, x); } diff --git a/src/visp/nn.h b/src/visp/nn.h new file mode 100644 index 0000000..eb8c106 --- /dev/null +++ b/src/visp/nn.h @@ -0,0 +1,41 @@ +#pragma once + +#include "visp/ml.h" +#include "visp/util.h" + +// Common neural network building blocks + +namespace visp { + +tensor linear(model_ref, tensor x); +tensor layer_norm(model_ref, tensor x, float eps = 1e-5f); + +// Permute between CWHN and WHCN tensor dimension ordering. Does not rewrite tensor data. 
+tensor permute_cwhn_to_whcn(model_ref m, tensor x); +tensor permute_whcn_to_cwhn(model_ref m, tensor x); + +// "Contiguous 2D" refers to the layout configured in `m` model flags, ie. the preferred +// memory layout for 2D operations like convolution. +inline bool is_whcn(model_ref m) { return !(m.flags & model_build_flag::cwhn); } +inline bool is_cwhn(model_ref m) { return !!(m.flags & model_build_flag::cwhn); } + +// These functions convert between memory layouts, ie. they rewrite tensor data. +tensor cwhn_to_contiguous_2d(model_ref m, tensor x); +tensor whcn_to_contiguous_2d(model_ref m, tensor x); +tensor contiguous_2d_to_cwhn(model_ref m, tensor x); +tensor contiguous_2d_to_whcn(model_ref m, tensor x); + +// Always returns number of elements of tensor in width-height-channels-batch order, +// even if that's not how they're stored in memory. +std::array<int64_t, 4> nelements_whcn(model_ref const&, tensor t); + +// 2D (convolution) functions +// Input and weight are expected to be in "contiguous 2D" layout as configured in `m`. +tensor conv_2d(model_ref m, tensor x, int stride = 1, int pad = 0); +tensor conv_2d_depthwise(model_ref m, tensor x, int stride = 1, int pad = 0); +tensor conv_2d_deform( + model_ref m, tensor x, tensor weight, tensor offset, tensor mask, int stride, int pad); +tensor conv_transpose_2d(model_ref m, tensor x, int stride); +tensor batch_norm_2d(model_ref, tensor x); + +} // namespace visp diff --git a/src/visp/nn.hpp b/src/visp/nn.hpp deleted file mode 100644 index 418c45c..0000000 --- a/src/visp/nn.hpp +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include "visp/ml.hpp" -#include "visp/util.hpp" - -// Common neural network building blocks - -namespace visp { - -tensor linear(model_ref, tensor x); -tensor layer_norm(model_ref, tensor x, float eps = 1e-5f); - -// Permute between CWHN and WHCN tensor dimension ordering. Does not rewrite tensor data. 
-tensor permute_cwhn_to_whcn(model_ref m, tensor x); -tensor permute_whcn_to_cwhn(model_ref m, tensor x); - -tensor conv_2d(model_ref, tensor x, int stride = 1, int pad = 0); -tensor conv_2d_depthwise(model_ref, tensor x, int stride = 1, int pad = 0); -tensor conv_2d_deform( - model_ref, tensor x, tensor weight, tensor offset, tensor mask, int stride, int pad); -tensor conv_transpose_2d(model_ref m, tensor x, int stride); - -tensor batch_norm_2d(model_ref, tensor x); - -} // namespace visp diff --git a/src/visp/platform.cpp b/src/visp/platform.cpp index a398483..eeaa28b 100644 --- a/src/visp/platform.cpp +++ b/src/visp/platform.cpp @@ -1,4 +1,4 @@ -#include "visp/platform.hpp" +#include "visp/platform.h" #ifdef _WIN32 # ifndef WIN32_LEAN_AND_MEAN diff --git a/src/visp/platform.hpp b/src/visp/platform.h similarity index 100% rename from src/visp/platform.hpp rename to src/visp/platform.h diff --git a/src/visp/vision.cpp b/src/visp/vision.cpp index 4d335ef..a743311 100644 --- a/src/visp/vision.cpp +++ b/src/visp/vision.cpp @@ -1,38 +1,38 @@ -#include "visp/vision.hpp" -#include "util/math.hpp" -#include "util/string.hpp" +#include "visp/vision.h" +#include "util/math.h" +#include "util/string.h" namespace visp { // // Mobile SAM -sam_model sam_load_model(char const* filepath, backend_device const& backend) { +sam_model sam_load_model(char const* filepath, backend_device const& dev) { sam_model model; - model.backend = &backend; - model_load_params load_params = { - .float_type = backend.preferred_float_type(), - .n_extra_tensors = 0, - }; - model.weights = model_load(filepath, backend, load_params); + model.backend = &dev; + model_file file = model_load(filepath); model.params = sam_params{}; - model.encoder = compute_graph_init(); - - model_ref m = model_ref(model.weights, model.encoder); - int res = model.params.image_size; - model.input_image = compute_graph_input(m, GGML_TYPE_F32, {3, res, res, 1}); - tensor embeds = sam_encode_image(m, model.input_image, model.params); - model.output_embed = compute_graph_output(m, embeds); - - compute_graph_allocate(model.encoder, backend); + model.weights = model_init(file.n_tensors()); + model_transfer(file, model.weights, dev, dev.preferred_float_type(), dev.preferred_layout()); return model; } -void sam_encode(sam_model& m, image_view image) { - m.image_extent = image.extent; - image_data img_data = sam_process_input(image, m.params); - transfer_to_backend(m.input_image, img_data); - compute(m.encoder, *m.backend); +void sam_encode(sam_model& model, image_view image) { + if (!model.encoder) { + model.encoder = compute_graph_init(); + model_ref m = model_ref(model.weights, model.encoder); + + int res = model.params.image_size; + model.input_image = compute_graph_input(m, GGML_TYPE_F32, {3, res, res, 1}); + tensor embeds = sam_encode_image(m, model.input_image, model.params); + model.output_embed = compute_graph_output(m, embeds); + compute_graph_allocate(model.encoder, *model.backend); + } + + model.image_extent = image.extent; + image_data img_data = sam_process_input(image, model.params); + transfer_to_backend(model.input_image, img_data); + compute(model.encoder, *model.backend); } image_data sam_compute_impl(sam_model& model, i32x2 point1, i32x2 point2) { @@ -52,6 +52,7 @@ image_data sam_compute_impl(sam_model& model, i32x2 point1, i32x2 point2) { compute_graph_allocate(model.decoder, *model.backend); } + f32x4 prompt_data = is_point ? 
sam_process_point(point1, model.image_extent, model.params) : sam_process_box({point1, point2}, model.image_extent, model.params); @@ -78,33 +79,33 @@ image_data sam_compute(sam_model& model, box_2d box) { // // BiRefNet -birefnet_model birefnet_load_model(char const* filepath, backend_device const& backend) { +birefnet_model birefnet_load_model(char const* filepath, backend_device const& dev) { birefnet_model model; - model.backend = &backend; - model_load_params load_params = { - .float_type = backend.preferred_float_type(), - .n_extra_tensors = swin_params::n_layers + 2 - }; - model.weights = model_load(filepath, backend, load_params); - model.params = birefnet_detect_params(model.weights); - - birefnet_buffers buffers = birefnet_precompute(model.weights, model.params); - model_allocate(model.weights, backend); - for (tensor_data const& buf : buffers) { - transfer_to_backend(buf); - } - - model.graph = compute_graph_init(6 * 1024); - model_ref m(model.weights, model.graph); - int res = model.params.image_size; - model.input = compute_graph_input(m, GGML_TYPE_F32, {3, res, res, 1}); - model.output = birefnet_predict(m, model.input, model.params); - compute_graph_allocate(model.graph, backend); - + model.backend = &dev; + model_file file = model_load(filepath); + model.params = birefnet_detect_params(file, {1024, 1024}); + model.weights = model_init(file.n_tensors()); + model_transfer(file, model.weights, dev, dev.preferred_float_type(), dev.preferred_layout()); return model; } image_data birefnet_compute(birefnet_model& model, image_view image) { + i32x2 res = birefnet_image_extent(image.extent, model.params); + if (!model.input || res != model.params.image_extent) { + model.params.image_extent = res; + model.graph = compute_graph_init(6 * 1024); + + model_ref m(model.weights, model.graph); + birefnet_buffers buffers = birefnet_precompute(m, model.params); + model.input = compute_graph_input(m, GGML_TYPE_F32, {3, res[0], res[1], 1}); + model.output = birefnet_predict(m, model.input, model.params); + + compute_graph_allocate(model.graph, *model.backend); + for (tensor_data const& buf : buffers) { + transfer_to_backend(buf); + } + } + image_data img_data = birefnet_process_input(image, model.params); transfer_to_backend(model.input, img_data); @@ -117,28 +118,28 @@ image_data birefnet_compute(birefnet_model& model, image_view image) { // // MI-GAN -migan_model migan_load_model(char const* filepath, backend_device const& backend) { +migan_model migan_load_model(char const* filepath, backend_device const& dev) { migan_model model; - model.backend = &backend; - model_load_params load_params = { - .float_type = backend.preferred_float_type(), - .n_extra_tensors = 0 - }; - model.weights = model_load(filepath, backend, load_params); - model.params = migan_detect_params(model.weights); + model.backend = &dev; + model_file file = model_load(filepath); + model.params = migan_detect_params(file); model.params.invert_mask = true; // inpaint opaque areas - int res = model.params.resolution; - - model.graph = compute_graph_init(); - model_ref m(model.weights, model.graph); - model.input = compute_graph_input(m, GGML_TYPE_F32, {4, res, res, 1}); - model.output = migan_generate(m, model.input, model.params); - compute_graph_allocate(model.graph, backend); - + model.weights = model_init(file.n_tensors()); + model_transfer(file, model.weights, dev, dev.preferred_float_type(), dev.preferred_layout()); return model; } image_data migan_compute(migan_model& model, image_view image, image_view mask) { + if 
(!model.graph) { + model.graph = compute_graph_init(); + model_ref m(model.weights, model.graph); + + int res = model.params.resolution; + model.input = compute_graph_input(m, GGML_TYPE_F32, {4, res, res, 1}); + model.output = migan_generate(m, model.input, model.params); + compute_graph_allocate(model.graph, *model.backend); + } + image_data input_data = migan_process_input(image, mask, model.params); transfer_to_backend(model.input, input_data); @@ -155,15 +156,13 @@ image_data migan_compute(migan_model& model, image_view image, image_view mask) constexpr int esrgan_default_tile_size = 224; -esrgan_model esrgan_load_model(char const* filepath, backend_device const& b) { +esrgan_model esrgan_load_model(char const* filepath, backend_device const& dev) { esrgan_model model; - model.backend = &b; - model_load_params load_params = { - .float_type = b.preferred_float_type(), - .n_extra_tensors = 0 - }; - model.weights = model_load(filepath, b, load_params); - model.params = esrgan_detect_params(model.weights); + model.backend = &dev; + model_file file = model_load(filepath); + model.params = esrgan_detect_params(file); + model.weights = model_init(file.n_tensors()); + model_transfer(file, model.weights, dev, dev.preferred_float_type(), dev.preferred_layout()); return model; } @@ -185,6 +184,7 @@ image_data esrgan_compute(esrgan_model& model, image_view image) { image_data input_tile = image_alloc(tiles.tile_size, image_format::rgb_f32); image_data output_tile = image_alloc(tiles_out.tile_size, image_format::rgb_f32); image_data output_image = image_alloc(image.extent * model.params.scale, image_format::rgb_f32); + image_clear(output_image); for (int t = 0; t < tiles.total(); ++t) { i32x2 tile_coord = tiles.coord(t); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 76aaade..6d06f46 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -6,8 +6,9 @@ target_sources(test-vision PRIVATE test-image.cpp ) target_include_directories(test-vision PRIVATE . ../src) -target_compile_options(test-vision PRIVATE ${VISP_WARNINGS}) -target_compile_definitions(test-vision PRIVATE ${VISP_ASSERT} ${VISP_FMT_DEFS}) +target_compile_definitions(test-vision PRIVATE ${VISP_ASSERT} ${VISP_DEFINITIONS}) +target_compile_options(test-vision PRIVATE ${VISP_WARNINGS} ${VISP_COMP_OPTIONS}) +target_link_options(test-vision PRIVATE ${VISP_LINK_OPTIONS}) target_link_libraries(test-vision PRIVATE visioncpp ${VISP_FMT_LINK}) add_test(NAME vision COMMAND test-vision -v) @@ -20,8 +21,9 @@ target_sources(test-models PRIVATE test-models.cpp ) target_include_directories(test-models PRIVATE . 
../src) -target_compile_options(test-models PRIVATE ${VISP_WARNINGS}) -target_compile_definitions(test-models PRIVATE ${VISP_ASSERT} ${VISP_FMT_DEFS}) +target_compile_definitions(test-models PRIVATE ${VISP_ASSERT} ${VISP_DEFINITIONS}) +target_compile_options(test-models PRIVATE ${VISP_WARNINGS} ${VISP_COMP_OPTIONS}) +target_link_options(test-models PRIVATE ${VISP_LINK_OPTIONS}) target_link_libraries(test-models PRIVATE visioncpp ${VISP_FMT_LINK}) if(VISP_VULKAN AND NOT VISP_CI) add_test(NAME models COMMAND test-models -v) @@ -37,7 +39,9 @@ include(reference-images.cmake) add_library(vision-workbench workbench.cpp) target_include_directories(vision-workbench PRIVATE ../src) -target_compile_definitions(vision-workbench PRIVATE ${VISP_ASSERT} ${VISP_FMT_DEFS}) +target_compile_definitions(vision-workbench PRIVATE ${VISP_ASSERT} ${VISP_DEFINITIONS}) +target_compile_options(vision-workbench PRIVATE ${VISP_COMP_OPTIONS}) +target_link_options(vision-workbench PRIVATE ${VISP_LINK_OPTIONS}) target_link_libraries(vision-workbench PRIVATE visioncpp ggml ${VISP_FMT_LINK}) # @@ -46,6 +50,7 @@ target_link_libraries(vision-workbench PRIVATE visioncpp ggml ${VISP_FMT_LINK}) add_executable(vision-bench) target_sources(vision-bench PRIVATE benchmark.cpp testing.cpp) target_include_directories(vision-bench PRIVATE . ../src) -target_compile_definitions(vision-bench PRIVATE VISP_TEST_NO_MAIN ${VISP_ASSERT} ${VISP_FMT_DEFS}) -target_compile_options(vision-bench PRIVATE ${VISP_WARNINGS}) +target_compile_definitions(vision-bench PRIVATE VISP_TEST_NO_MAIN ${VISP_ASSERT} ${VISP_DEFINITIONS}) +target_compile_options(vision-bench PRIVATE ${VISP_WARNINGS} ${VISP_COMP_OPTIONS}) +target_link_options(vision-bench PRIVATE ${VISP_LINK_OPTIONS}) target_link_libraries(vision-bench PRIVATE visioncpp ${VISP_FMT_LINK}) diff --git a/tests/benchmark.cpp b/tests/benchmark.cpp index 5474a39..a75bd13 100644 --- a/tests/benchmark.cpp +++ b/tests/benchmark.cpp @@ -1,8 +1,8 @@ -#include "testing.hpp" -#include "visp/image.hpp" -#include "visp/ml.hpp" -#include "visp/util.hpp" -#include "visp/vision.hpp" +#include "testing.h" +#include "visp/image.h" +#include "visp/ml.h" +#include "visp/util.h" +#include "visp/vision.h" #include #include @@ -114,7 +114,7 @@ bench_timings benchmark_esrgan(path model_path, backend_device& backend) { image_data input_data = image_u8_to_f32(input, image_format::rgb_f32); compute_graph graph = compute_graph_init(esrgan_estimate_graph_size(model.params)); - model_ref m(model.weights, model.graph); + model_ref m(model.weights, graph); i64x4 input_shape = {3, input.extent[0], input.extent[1], 1}; model.input = compute_graph_input(m, GGML_TYPE_F32, input_shape); model.output = esrgan_generate(m, model.input, model.params); @@ -259,7 +259,7 @@ int main(int argc, char** argv) { line, "| {: <10} | {: <30} | {: <6} | {: >11} | {: >6} |\n", "Arch", "Model", "Device", "Avg", "Dev")); printf("|:-----------|:-------------------------------|:-------|------------:|-------:|\n"); for (const auto& result : results) { - auto model = result.model.substr(std::max(int(result.model.length()) - 32, 0)); + auto model = result.model.substr(std::max(int(result.model.length()) - 30, 0)); print(format( line, "| {: <10} | {: <30} | {: <6} | {:8.1f} ms | {:6.1f} |\n", result.arch, model, result.backend, result.time.mean, result.time.stdev)); diff --git a/tests/reference-images.cmake b/tests/reference-images.cmake index ff9b6c2..d2d0a0b 100644 --- a/tests/reference-images.cmake +++ b/tests/reference-images.cmake @@ -1,4 +1,5 @@ 
file(DOWNLOAD "https://lfs.interstice.cloud/vision.cpp/tests/reference/birefnet-cpu.png/c8663d4c985f94b29fcca3c3c5d2058c53447f19c521b7c5f97276cace68bb09" "tests/reference/birefnet-cpu.png" EXPECTED_HASH SHA256=c8663d4c985f94b29fcca3c3c5d2058c53447f19c521b7c5f97276cace68bb09) +file(DOWNLOAD "https://lfs.interstice.cloud/vision.cpp/tests/reference/birefnet-dynamic.png/720bf20140f6f93c3c3953ed2e28a9cb395def8426f53c031d58a8393784227f" "tests/reference/birefnet-dynamic.png" EXPECTED_HASH SHA256=720bf20140f6f93c3c3953ed2e28a9cb395def8426f53c031d58a8393784227f) file(DOWNLOAD "https://lfs.interstice.cloud/vision.cpp/tests/reference/birefnet-gpu.png/c8663d4c985f94b29fcca3c3c5d2058c53447f19c521b7c5f97276cace68bb09" "tests/reference/birefnet-gpu.png" EXPECTED_HASH SHA256=c8663d4c985f94b29fcca3c3c5d2058c53447f19c521b7c5f97276cace68bb09) file(DOWNLOAD "https://lfs.interstice.cloud/vision.cpp/tests/reference/esrgan-cpu.png/481dcc0eb617feb9f8f7403ce179e77e2eba2c7a067f4a1ea90e0fb47083d814" "tests/reference/esrgan-cpu.png" EXPECTED_HASH SHA256=481dcc0eb617feb9f8f7403ce179e77e2eba2c7a067f4a1ea90e0fb47083d814) file(DOWNLOAD "https://lfs.interstice.cloud/vision.cpp/tests/reference/esrgan-gpu.png/a8bfab0e07aeca16b737872bb3dbbe0e6b76cfff5616d2f02f2b0465cc7a0937" "tests/reference/esrgan-gpu.png" EXPECTED_HASH SHA256=a8bfab0e07aeca16b737872bb3dbbe0e6b76cfff5616d2f02f2b0465cc7a0937) @@ -7,4 +8,4 @@ file(DOWNLOAD "https://lfs.interstice.cloud/vision.cpp/tests/reference/migan-gpu file(DOWNLOAD "https://lfs.interstice.cloud/vision.cpp/tests/reference/mobile_sam-box-cpu.png/1a4d1a6a45861c8481e55d215b0d8a57c7fd7cb29c0698fa1fad0e96b59c13e8" "tests/reference/mobile_sam-box-cpu.png" EXPECTED_HASH SHA256=1a4d1a6a45861c8481e55d215b0d8a57c7fd7cb29c0698fa1fad0e96b59c13e8) file(DOWNLOAD "https://lfs.interstice.cloud/vision.cpp/tests/reference/mobile_sam-box-gpu.png/51e1a3ac5ba467152b1858d98d4522f401e0d7104069e915e87df6df5993877c" "tests/reference/mobile_sam-box-gpu.png" EXPECTED_HASH SHA256=51e1a3ac5ba467152b1858d98d4522f401e0d7104069e915e87df6df5993877c) file(DOWNLOAD "https://lfs.interstice.cloud/vision.cpp/tests/reference/mobile_sam-point-cpu.png/1abe24d0d0e5d5a703ab13a1c7dc7e1e24dd4e239dbee54ce70cac3edeccaff3" "tests/reference/mobile_sam-point-cpu.png" EXPECTED_HASH SHA256=1abe24d0d0e5d5a703ab13a1c7dc7e1e24dd4e239dbee54ce70cac3edeccaff3) -file(DOWNLOAD "https://lfs.interstice.cloud/vision.cpp/tests/reference/mobile_sam-point-gpu.png/2016069b41f87f6958eaffe66baf686660eae72acaa4cff2febb2d6a8d170912" "tests/reference/mobile_sam-point-gpu.png" EXPECTED_HASH SHA256=2016069b41f87f6958eaffe66baf686660eae72acaa4cff2febb2d6a8d170912) +file(DOWNLOAD "https://lfs.interstice.cloud/vision.cpp/tests/reference/mobile_sam-point-gpu.png/2016069b41f87f6958eaffe66baf686660eae72acaa4cff2febb2d6a8d170912" "tests/reference/mobile_sam-point-gpu.png" EXPECTED_HASH SHA256=2016069b41f87f6958eaffe66baf686660eae72acaa4cff2febb2d6a8d170912) \ No newline at end of file diff --git a/tests/test-image.cpp b/tests/test-image.cpp index b3eaa18..25fc10b 100644 --- a/tests/test-image.cpp +++ b/tests/test-image.cpp @@ -1,7 +1,7 @@ -#include "testing.hpp" -#include "visp/image-impl.hpp" -#include "visp/image.hpp" -#include "visp/util.hpp" +#include "testing.h" +#include "visp/image-impl.h" +#include "visp/image.h" +#include "visp/util.h" #include #include diff --git a/tests/test-models.cpp b/tests/test-models.cpp index 531c5ac..3f7b803 100644 --- a/tests/test-models.cpp +++ b/tests/test-models.cpp @@ -1,7 +1,7 @@ -#include "util/string.hpp" -#include 
"visp/vision.hpp" +#include "util/string.h" +#include "visp/vision.h" -#include "testing.hpp" +#include "testing.h" namespace visp { @@ -47,10 +47,29 @@ VISP_BACKEND_TEST(test_birefnet)(backend_type bt) { image_data input = image_load(input_path.string().c_str()); image_data output = birefnet_compute(model, input); - float tolerance = bt == backend_type::cpu ? 0.01f : 0.3f; // TODO: GPU is non-deterministic + float tolerance = bt == backend_type::cpu ? 0.01f : 0.015f; compare_images(name, output, tolerance); } +VISP_TEST(test_birefnet_dynamic) { + path model_path = test_dir().models / "BiRefNet-dynamic-F16.gguf"; + if (!exists(model_path) || !backend_is_available(backend_type::gpu)) { + throw test_skip{"Model not available"}; // it's a large model + } + // Test using 2 images with different resolutions one after the other + path input_path1 = test_dir().input / "cat-and-hat.jpg"; + path input_path2 = test_dir().input / "wardrobe.jpg"; + + backend_device b = backend_init(backend_type::gpu); + birefnet_model model = birefnet_load_model(model_path.string().c_str(), b); + image_data input1 = image_load(input_path1.string().c_str()); + image_data input2 = image_load(input_path2.string().c_str()); + image_data output1 = birefnet_compute(model, input1); + image_data output2 = birefnet_compute(model, input2); + + compare_images("birefnet-dynamic.png", output2, 0.015f); +} + VISP_BACKEND_TEST(test_migan)(backend_type bt) { path model_path = test_dir().models / "MIGAN-512-places2-F16.gguf"; path image_path = test_dir().input / "bench-image.jpg"; diff --git a/tests/test_birefnet.py b/tests/test_birefnet.py index 33dcf8c..353bb0d 100644 --- a/tests/test_birefnet.py +++ b/tests/test_birefnet.py @@ -15,6 +15,8 @@ torch.set_printoptions(precision=3, linewidth=100, edgeitems=6, sci_mode=False) +nhwc_layout = dict(memory_layout="nhwc") + class WindowAttention(nn.Module): def __init__( @@ -741,7 +743,7 @@ def test_encode(): state.update({f"input{i}": to_nhwc(xs[i]) for i in range(4)}) state.update({f"input_low{i}": to_nhwc(xs_low[i]) for i in range(4)}) - results = workbench.invoke_test("biref_encode", x, state) + results = workbench.invoke_test("biref_encode", x, state, nhwc_layout) for i, e in enumerate(expected): result = to_nchw(results[i]) @@ -755,28 +757,37 @@ def test_encode(): @pytest.mark.parametrize("scenario", ["small", "large"]) +@pytest.mark.parametrize("memory_layout", ["nchw", "nhwc"]) @pytest.mark.parametrize("backend", ["cpu", "vulkan"]) -def test_conv_2d_deform(scenario: str, backend: str): +def test_conv_2d_deform(scenario: str, memory_layout: str, backend: str): + torch.manual_seed(42) + if memory_layout == "nhwc" and backend == "vulkan": + pytest.skip("conv_2d_deform with nhwc layout is not supported on Vulkan") + w, h, c_in, c_out, k = { "small": (4, 4, 5, 2, 3), - "large": (42, 38, 82, 32, 3), + "large": (49, 38, 81, 17, 7), }[scenario] - x = input_tensor(1, c_in, h, w) - weight = input_tensor(c_out, c_in, k, k) + x = torch.rand(1, c_in, h, w) - 0.5 + weight = torch.rand(c_out, c_in, k, k) - 0.5 offset = 1.0 - input_tensor(1, 2 * k * k, h, w) mask = torch.rand(1, k * k, h, w) - expected = torchvision.ops.deform_conv2d(x, offset, weight, mask=mask, padding=(1, 1)) + expected = torchvision.ops.deform_conv2d(x, offset, weight, mask=mask, padding=(k // 2, k // 2)) - x = to_nhwc(x) state = { - "weight": to_nhwc(weight), - "offset": to_nhwc(offset), - "mask": to_nhwc(mask), + "weight": weight, + "offset": offset, + "mask": mask, } - result = workbench.invoke_test("conv_2d_deform", x, state, 
backend=backend)
-    result = to_nchw(result)
+    if memory_layout == "nhwc":
+        x = to_nhwc(x)
+        state = {k: to_nhwc(v) for k, v in state.items()}
+    params = dict(memory_layout=memory_layout, padding=k // 2)
+    result = workbench.invoke_test("conv_2d_deform", x, state, params, backend=backend)
+    if memory_layout == "nhwc":
+        result = to_nchw(result)
 
-    assert torch.allclose(result, expected, atol=1e-2 if backend == "vulkan" else 1e-5)
+    assert torch.allclose(result, expected, atol=0.1 if backend == "vulkan" else 0.001)
 
 
 class DeformableConv2d(nn.Module):
@@ -862,7 +873,7 @@ def test_deformable_conv_2d():
     state = convert_to_nhwc(state, key="conv")
     state = {shorten_weight_name(k): v for k, v in state.items()}
     x = to_nhwc(x)
-    result = workbench.invoke_test("biref_deformable_conv_2d", x, state)
+    result = workbench.invoke_test("biref_deformable_conv_2d", x, state, nhwc_layout)
     result = to_nchw(result)
 
     assert torch.allclose(result, expected)
@@ -907,10 +918,8 @@ def test_global_avg_pool(backend: str):
 
     state = fuse_all_conv_2d_batch_norm(state, "", "1", "2")
     state = convert_to_nhwc(state, key="1.weight")
-    for k, v in state.items():
-        print(f"{k}: {v.shape}")
     x = to_nhwc(x)
-    result = workbench.invoke_test("biref_global_avg_pool", x, state, backend=backend)
+    result = workbench.invoke_test("biref_global_avg_pool", x, state, nhwc_layout, backend=backend)
     result = to_nchw(result)
 
     assert torch.allclose(result, expected)
@@ -1002,7 +1011,7 @@ def test_aspp_deformable():
 
     state = {shorten_weight_name(k): v for k, v in state.items()}
     x = to_nhwc(x)
-    result = workbench.invoke_test("biref_aspp_deformable", x, state)
+    result = workbench.invoke_test("biref_aspp_deformable", x, state, nhwc_layout)
     result = to_nchw(result)
 
     assert torch.allclose(result, expected)
@@ -1048,7 +1057,7 @@ def test_basic_dec_blk():
 
     state = {shorten_weight_name(k): v for k, v in state.items()}
     x = to_nhwc(x)
-    result = workbench.invoke_test("biref_basic_dec_blk", x, state)
+    result = workbench.invoke_test("biref_basic_dec_blk", x, state, nhwc_layout)
     result = to_nchw(result)
 
     assert torch.allclose(result, expected)
@@ -1300,7 +1309,7 @@ def test_decoder():
     state["x3"] = to_nhwc(x3)
     state["x4"] = to_nhwc(x4)
 
-    result = workbench.invoke_test("biref_decode", x, state)
+    result = workbench.invoke_test("biref_decode", x, state, nhwc_layout)
     result = to_nchw(result)
 
     assert torch.allclose(result, expected)
diff --git a/tests/test_esrgan.py b/tests/test_esrgan.py
index 42f0a2e..92893f7 100644
--- a/tests/test_esrgan.py
+++ b/tests/test_esrgan.py
@@ -3,8 +3,7 @@ from torch import nn
 
 from . import workbench
-from .workbench import to_nhwc, to_nchw, convert_to_nhwc
-from .workbench import input_tensor, generate_state
+from .workbench import input_tensor, generate_state, to_nhwc, to_nchw
 
 torch.set_printoptions(precision=3, sci_mode=False)
 
 
@@ -66,11 +65,7 @@ def test_upconv():
     x = input_tensor(1, 3, 2, 2)
     expected = block(x)
 
-    x = to_nhwc(x)
-    state = convert_to_nhwc(state, "1.")
     result = workbench.invoke_test("esrgan_upconv", x, state)
-    result = to_nchw(result)
-
     assert torch.allclose(result, expected)
 
 
@@ -107,11 +102,7 @@ def test_residual_dense_block():
     x = 0.1 * (input_tensor(1, 8, 6, 6) - 0.5)
     expected = block(x)
 
-    x = to_nhwc(x)
-    state = convert_to_nhwc(state, "conv")
     result = workbench.invoke_test("esrgan_residual_dense_block", x, state)
-    result = to_nchw(result)
-
     assert torch.allclose(result, expected)
 
 
@@ -141,12 +132,7 @@ def test_rrdb():
     x = 0.1 * input_tensor(1, 8, 6, 6)
     expected = block(x)
 
-    x = to_nhwc(x)
-    state = convert_to_nhwc(state, "conv")
-    result = to_nhwc(torch.zeros_like(expected))
     result = workbench.invoke_test("esrgan_rrdb", x, state)
-    result = to_nchw(result)
-
     assert torch.allclose(result, expected, atol=1e-5)
 
 
@@ -246,7 +232,6 @@ def test_rrdbnet():
     expected = model(x)
 
     x = to_nhwc(x)
-    state = convert_to_nhwc(state, ".")
     result = workbench.invoke_test("esrgan_rrdbnet", x, state)
     result = to_nchw(result)
 
diff --git a/tests/test_migan.py b/tests/test_migan.py
index 9b89131..a717615 100644
--- a/tests/test_migan.py
+++ b/tests/test_migan.py
@@ -10,6 +10,8 @@
 
 torch.set_printoptions(precision=3, sci_mode=False)
 
+nhwc_layout = dict(memory_layout="nhwc")
+
 
 class lrelu_agc:
     def __init__(self, alpha=0.2, gain=1, clamp=None):
@@ -89,9 +91,7 @@ def __init__(self, in_channels):
             stride=2,
         )
         f = setup_filter([1, 3, 3, 1], gain=1)
-        self.filter.weight = nn.Parameter(
-            f.repeat([*self.filter.weight.shape[:2], 1, 1])
-        )
+        self.filter.weight = nn.Parameter(f.repeat([*self.filter.weight.shape[:2], 1, 1]))
 
     def forward(self, x):
         x = self.filter(x)
@@ -106,7 +106,7 @@ def test_downsample2d():
 
     state = convert_to_nhwc(state, key="filter.")
     x = to_nhwc(x)
-    result = workbench.invoke_test("migan_downsample_2d", x, state)
+    result = workbench.invoke_test("migan_downsample_2d", x, state, nhwc_layout)
     result = to_nchw(result)
 
     assert torch.allclose(result, expected)
@@ -118,9 +118,7 @@ def __init__(self, in_channels, resolution=None):
         self.nearest_up = nn.Upsample(scale_factor=2, mode="nearest")
         w = torch.tensor([[1.0, 0.0], [0.0, 0.0]], dtype=torch.float32)
         assert resolution is not None
-        self.register_buffer(
-            "filter_const", w.repeat(1, 1, resolution // 2, resolution // 2)
-        )
+        self.register_buffer("filter_const", w.repeat(1, 1, resolution // 2, resolution // 2))
 
         self.filter = nn.Conv2d(
             in_channels=in_channels,
@@ -131,9 +129,7 @@ def __init__(self, in_channels, resolution=None):
         )
 
         f = setup_filter([1, 3, 3, 1], gain=4)
-        self.filter.weight = nn.Parameter(
-            f.repeat([*self.filter.weight.shape[:2], 1, 1])
-        )
+        self.filter.weight = nn.Parameter(f.repeat([*self.filter.weight.shape[:2], 1, 1]))
 
     def forward(self, x):
         x = self.nearest_up(x)
@@ -151,14 +147,13 @@ def test_upsample2d():
 
     state = convert_to_nhwc(state, key="filter.")
     x = to_nhwc(x)
-    result = workbench.invoke_test("migan_upsample_2d", x, state)
+    result = workbench.invoke_test("migan_upsample_2d", x, state, nhwc_layout)
     result = to_nchw(result)
 
     assert torch.allclose(result, expected)
 
 
 class SeparableConv2d(nn.Module):
-
     def __init__(
         self,
         in_channels,
@@ -242,7 +237,7 @@ def test_separable_conv2d():
     state = convert_to_nhwc(state, key="conv")
     state["noise_strength"] = torch.tensor([0.5])
     x = to_nhwc(x)
-    result = workbench.invoke_test("migan_separable_conv_2d", x, state)
+    result = workbench.invoke_test("migan_separable_conv_2d", x, state, nhwc_layout)
     result = to_nchw(result)
 
     assert torch.allclose(result, expected)
@@ -295,16 +290,12 @@ def __init__(
         self.encode_res = [2**i for i in range(log2res, 1, -1)]
         self.ic_n = ic_n
 
-        for idx, (resi, resj) in enumerate(
-            zip(self.encode_res[:-1], self.encode_res[1:])
-        ):
+        for idx, (resi, resj) in enumerate(zip(self.encode_res[:-1], self.encode_res[1:])):
             hidden_ch_i = min(ch_base // resi, ch_max)
             hidden_ch_j = min(ch_base // resj, ch_max)
 
             if idx == 0:
-                block = EncoderBlock(
-                    hidden_ch_i, hidden_ch_j, rgb_n=ic_n, activation=activation
-                )
+                block = EncoderBlock(hidden_ch_i, hidden_ch_j, rgb_n=ic_n, activation=activation)
             else:
                 block = EncoderBlock(hidden_ch_i, hidden_ch_j, activation=activation)
 
@@ -342,7 +333,7 @@ def test_encoder():
         if "noise_strength" in k:
             state[k] = torch.tensor([0.5])
     x = to_nhwc(x)
-    result = workbench.invoke_test("migan_encoder", x, state)
+    result = workbench.invoke_test("migan_encoder", x, state, nhwc_layout)
     result = to_nchw(result)
 
     assert torch.allclose(result, expected)
@@ -454,9 +445,7 @@ def __init__(
         self.block_res = block_res
 
         hidden_ch = min(ch_base // block_res[0], ch_max)
-        self.b4 = SynthesisBlockFirst(
-            hidden_ch, resolution=4, rgb_n=rgb_n, activation=activation
-        )
+        self.b4 = SynthesisBlockFirst(hidden_ch, resolution=4, rgb_n=rgb_n, activation=activation)
 
         for resi, resj in zip(block_res[:-1], block_res[1:]):
             hidden_ch_i = min(ch_base // resi, ch_max)
@@ -499,7 +488,7 @@ def test_synthesis():
             state[k] = torch.tensor([0.5])
     x = to_nhwc(x)
     state.update({f"feat{k}": to_nhwc(v) for k, v in enc_feats.items()})
-    result = workbench.invoke_test("migan_synthesis", x, state)
+    result = workbench.invoke_test("migan_synthesis", x, state, nhwc_layout)
     result = to_nchw(result)
 
     assert torch.allclose(result, expected)
diff --git a/tests/test_mobile_sam.py b/tests/test_mobile_sam.py
index cb93313..6bcc090 100644
--- a/tests/test_mobile_sam.py
+++ b/tests/test_mobile_sam.py
@@ -10,6 +10,7 @@
 
 torch.set_printoptions(precision=2, linewidth=100, sci_mode=False)
 
+nhwc_layout = dict(memory_layout="nhwc")
 
 #
 # Image Encoder
@@ -49,7 +50,7 @@ def test_conv_2d_batch_norm(bias: bool):
     state = fuse_all_conv_2d_batch_norm(state)
     state = convert_to_nhwc(state)
     x = to_nhwc(x)
-    result = workbench.invoke_test("sam_conv_2d_batch_norm", x, state)
+    result = workbench.invoke_test("sam_conv_2d_batch_norm", x, state, nhwc_layout)
     result = to_nchw(result)
 
     assert torch.allclose(result, expected)
@@ -94,7 +95,7 @@ def test_patch_embed():
     convert_to_nhwc(state)
     x = to_nhwc(x)
     result = to_nhwc(torch.zeros_like(expected))
-    result = workbench.invoke_test("sam_patch_embed", x, state)
+    result = workbench.invoke_test("sam_patch_embed", x, state, nhwc_layout)
     result = to_nchw(result)
 
     assert torch.allclose(result, expected, rtol=0.001, atol=0.02)
@@ -126,7 +127,7 @@ def test_layer_norm_2d():
 
     x = to_nhwc(x)
     result = to_nhwc(torch.zeros_like(expected))
-    result = workbench.invoke_test("layer_norm", x, state)
+    result = workbench.invoke_test("layer_norm", x, state, nhwc_layout)
    result = to_nchw(result)
 
     assert torch.allclose(result, expected, rtol=0.001, atol=0.02)
@@ -188,7 +189,7 @@ def test_mb_conv():
     state = fuse_all_conv_2d_batch_norm(state)
     convert_to_nhwc(state)
     x = to_nhwc(x)
-    result = workbench.invoke_test("sam_mb_conv", x, state)
+    result = workbench.invoke_test("sam_mb_conv", x, state, nhwc_layout)
     result = to_nchw(result)
 
     # precision: ggml_gelu uses fp16 look-up table & tanh approximation
@@ -239,7 +240,7 @@ def test_patch_merging():
     state = fuse_all_conv_2d_batch_norm(state)
     convert_to_nhwc(state)
     x = to_nhwc(x)
-    result = workbench.invoke_test("sam_patch_merging", x, state)
+    result = workbench.invoke_test("sam_patch_merging", x, state, nhwc_layout)
     result = result.transpose(1, 2).reshape_as(expected)
 
     # precision: ggml_gelu uses fp16 look-up table & tanh approximation
@@ -492,7 +493,7 @@ def test_tiny_vit_block():
     ]
     state = fuse_all_conv_2d_batch_norm(state)
     state = convert_to_nhwc(state)
-    result = workbench.invoke_test("sam_tiny_vit_block", x, state)
+    result = workbench.invoke_test("sam_tiny_vit_block", x, state, nhwc_layout)
 
     assert torch.allclose(result, expected, rtol=0.001, atol=0.02)
 
@@ -1253,7 +1254,7 @@ def test_two_way_transformer():
     state["input_image_pe"] = to_nhwc(image_pe)
     state["input_point_embedding"] = point_embedding
     result_queries, result_keys = workbench.invoke_test(
-        "sam_two_way_transformer", image_embedding, state
+        "sam_two_way_transformer", image_embedding, state, nhwc_layout
     )
 
     assert torch.allclose(result_queries, expected_queries, atol=1e-6, rtol=1e-4)
@@ -1321,7 +1322,7 @@ def test_output_upscaling():
     expected = upscaling(x)
 
     x = to_nhwc(x)
-    result = workbench.invoke_test("sam_output_upscaling", x, state, backend="vulkan")
+    result = workbench.invoke_test("sam_output_upscaling", x, state, nhwc_layout, backend="vulkan")
     result = to_nchw(result)
 
     assert torch.allclose(result, expected, atol=1e-4, rtol=1e-2)  # fp16 weights
@@ -1460,7 +1461,7 @@ def test_predict_masks():
     state["input_dense_prompt"] = to_nhwc(dense_prompt_embeddings)
     result_masks = torch.zeros_like(expected_masks).contiguous()
     result_masks, result_iou_pred = workbench.invoke_test(
-        "sam_predict_masks", image_embeddings, state, backend="vulkan"
+        "sam_predict_masks", image_embeddings, state, nhwc_layout, backend="vulkan"
    )
 
     assert torch.allclose(result_masks, expected_masks, rtol=1e-2, atol=1e-2)
diff --git a/tests/testing.cpp b/tests/testing.cpp
index 7949e90..d92c327 100644
--- a/tests/testing.cpp
+++ b/tests/testing.cpp
@@ -1,5 +1,5 @@
-#include "testing.hpp"
-#include "visp/ml.hpp"
+#include "testing.h"
+#include "visp/ml.h"
 #include
 #include
 
@@ -23,6 +23,7 @@ int main(int argc, char** argv) {
    int passed = 0;
    int failed = 0;
    int errors = 0;
+    int skipped = 0;
 
    std::string_view filter;
    bool exclude_gpu = false;
@@ -61,7 +62,7 @@ int main(int argc, char** argv) {
            }
        } catch (const visp::test_failure& e) {
            ++failed;
-            printf(" %s\n", "\033[31mFAILED\033[0m");
+            printf("%s %s\n", verbose ? "" : name, "\033[31mFAILED\033[0m");
            printf(" \033[90m%s:%d:\033[0m Assertion failed\n", e.file, e.line);
            printf(" \033[93m%s\033[0m\n", e.condition);
            if (e.eval) {
@@ -70,9 +71,14 @@ int main(int argc, char** argv) {
            if (!visp::extra_info.empty()) {
                printf(" %s\n", visp::extra_info.c_str());
            }
+        } catch (const visp::test_skip&) {
+            ++skipped;
+            if (verbose) {
+                printf(" %s\n", "\033[33mSKIPPED\033[0m");
+            }
        } catch (const std::exception& e) {
            ++errors;
-            printf(" %s\n", "\033[31mERROR\033[0m");
+            printf("%s %s\n", verbose ? "" : name, "\033[31mERROR\033[0m");
            printf(" \033[90m%s:%d:\033[0m Unhandled exception\n", test.file, test.line);
            printf(" \033[93m%s\033[0m\n", e.what());
        }
@@ -107,6 +113,9 @@ int main(int argc, char** argv) {
    if (errors > 0) {
        printf("\033[31m%d errors, ", errors);
    }
+    if (skipped > 0) {
+        printf("\033[33m%d skipped, ", skipped);
+    }
    printf("\033[92m%d passed %sin %lldms\033[0m\n", passed, color, (long long)duration);
 
    return (failed > 0 || errors > 0) ? 1 : 0;
diff --git a/tests/testing.hpp b/tests/testing.h
similarity index 96%
rename from tests/testing.hpp
rename to tests/testing.h
index b074cd6..f9d91ae 100644
--- a/tests/testing.hpp
+++ b/tests/testing.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "util/string.hpp"
+#include "util/string.h"
 #include
 #include
 
@@ -56,6 +56,11 @@ struct test_directories {
 
 test_directories const& test_dir();
 
+// Use `throw test_skip{"reason"}` in a test case to skip it without failing
+struct test_skip {
+    char const* reason = nullptr;
+};
+
 float& test_tolerance_value();
 
 struct test_with_tolerance {
diff --git a/tests/workbench.cpp b/tests/workbench.cpp
index 393ac13..f31e83d 100644
--- a/tests/workbench.cpp
+++ b/tests/workbench.cpp
@@ -1,9 +1,9 @@
-#include "util/string.hpp"
-#include "visp/arch/birefnet.hpp"
-#include "visp/arch/esrgan.hpp"
-#include "visp/arch/migan.hpp"
-#include "visp/arch/mobile-sam.hpp"
-#include "visp/nn.hpp"
+#include "util/string.h"
+#include "visp/arch/birefnet.h"
+#include "visp/arch/esrgan.h"
+#include "visp/arch/migan.h"
+#include "visp/arch/mobile-sam.h"
+#include "visp/nn.h"
 
 #include
 #include
@@ -11,6 +11,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
 
@@ -95,7 +96,8 @@ DEF(conv_2d_deform)(model_ref m, span<tensor> input, param_dict const& p) {
    tensor weight = m.weights("weight");
    tensor offset = m.weights("offset");
    tensor mask = m.find("mask");
-    return {conv_2d_deform(m, input[0], weight, offset, mask, 1, 1)};
+    int padding = p.get("padding", 1);
+    return {conv_2d_deform(m, input[0], weight, offset, mask, 1, padding)};
 }
 
 DEF(batch_norm_2d)(model_ref m, span<tensor> input, param_dict const& p) {
@@ -130,7 +132,7 @@ DEF(sam_mb_conv)(model_ref m, span<tensor> input, param_dict const& p) {
 }
 
 DEF(sam_patch_merging)(model_ref m, span<tensor> input, param_dict const& p) {
-    return {sam::patch_merging(m, input[0], 32)};
+    return {sam::patch_merging(m, input[0])};
 }
 
 DEF(sam_mlp)(model_ref m, span<tensor> input, param_dict const& p) {
@@ -239,8 +241,8 @@ DEF(biref_relative_position_index)(model_ref m, span<tensor> input, param_dict c
 DEF(biref_window_attention)(model_ref m, span<tensor> input, param_dict const& p) {
    int window_size = 3;
    tensor mask = m.find("mask");
-    auto rel_pos_index = birefnet::create_relative_position_index(m.weights_context, window_size);
-    ggml_backend_alloc_ctx_tensors(m.weights_context, workbench_backend());
+    auto rel_pos_index = birefnet::create_relative_position_index(m, window_size);
+    ggml_backend_alloc_ctx_tensors(m, workbench_backend());
    transfer_to_backend(rel_pos_index);
    return {birefnet::window_attention(m, input[0], mask, 2, window_size)};
 }
@@ -253,8 +255,8 @@ DEF(biref_swin_block)(model_ref m, span<tensor> input, param_dict const& p) {
    block.h = 6;
    block.shift = 0;
    tensor mask = m.find("mask");
-    auto rel_pos_index = birefnet::create_relative_position_index(m.weights_context, 3);
-    ggml_backend_alloc_ctx_tensors(m.weights_context, workbench_backend());
+    auto rel_pos_index = birefnet::create_relative_position_index(m, 3);
+    ggml_backend_alloc_ctx_tensors(m, workbench_backend());
    transfer_to_backend(rel_pos_index);
    return {birefnet::swin_block(m, input[0], mask, block)};
 }
@@ -275,9 +277,11 @@ DEF(biref_swin_layer)(model_ref m, span<tensor> input, param_dict const& p) {
    layer.n_heads = 2;
    layer.n_features = 8;
    layer.downsample = true;
-    auto rel_pos_index = birefnet::create_relative_position_index(m.weights_context, 3);
-    ggml_backend_alloc_ctx_tensors(m.weights_context, workbench_backend());
+    auto rel_pos_index = birefnet::create_relative_position_index(m, 3);
+    auto attn_mask = birefnet::create_attention_mask(m, 6, 6, 3);
+    ggml_backend_alloc_ctx_tensors(m, workbench_backend());
    transfer_to_backend(rel_pos_index);
+    transfer_to_backend(attn_mask);
    auto result = birefnet::swin_layer(m, input[0], 6, 6, layer, 3);
    ASSERT(result.w_down == 3 && result.h_down == 3);
    return {result.x_down};
@@ -293,11 +297,11 @@ DEF(biref_swin_transformer)(model_ref m, span<tensor> input, param_dict const& p
        swin_layer_t{2, 4, 8 * 4, true},
        swin_layer_t{2, 2, 8 * 8, false},
    }};
-    auto rel_pos_index = birefnet::create_relative_position_index(m.weights_context, 3);
+    auto rel_pos_index = birefnet::create_relative_position_index(m, 3);
    auto attn_masks = std::array{
-        birefnet::create_attention_mask(m.weights_context, 8, 8, 3), birefnet::create_attention_mask(m.weights_context, 4, 4, 3),
-        birefnet::create_attention_mask(m.weights_context, 2, 2, 3), birefnet::create_attention_mask(m.weights_context, 1, 1, 3)};
-    ggml_backend_alloc_ctx_tensors(m.weights_context, workbench_backend());
+        birefnet::create_attention_mask(m, 8, 8, 3), birefnet::create_attention_mask(m, 4, 4, 3),
+        birefnet::create_attention_mask(m, 2, 2, 3), birefnet::create_attention_mask(m, 1, 1, 3)};
+    ggml_backend_alloc_ctx_tensors(m, workbench_backend());
    transfer_to_backend(rel_pos_index);
    for (auto&& attn_mask : attn_masks) {
        transfer_to_backend(attn_mask);
@@ -520,7 +524,8 @@ void workbench_run(
    workbench& w = get_workbench();
    w.current_backend = backend_init(backend_type);
 
-    model_weights weights = model_init(w.current_backend, tensors.size() + 10);
+    model_weights weights = model_init(tensors.size() + 10);
+    weights.buffer_type = backend_type;
    compute_graph graph = compute_graph_init(1024);
    model_ref m(weights, graph);
@@ -541,8 +546,12 @@ void workbench_run(
    }
    param_dict test_params = build_dict(params);
 
-    test_case const& test = workbench_find_test(test_name);
+    std::string_view memory_layout = test_params.get("memory_layout", "whcn");
+    if (memory_layout == "cwhn" || memory_layout == "nhwc") {
+        m.flags |= model_build_flag::cwhn;
+    }
 
+    test_case const& test = workbench_find_test(test_name);
    std::vector<tensor> outputs = test.func(m, inputs, test_params);
    for (tensor& out : outputs) {
        out = compute_graph_output(m, ggml_cont(m, out));