diff --git a/common.hpp b/common.hpp index d9c823df0..8b67815cd 100644 --- a/common.hpp +++ b/common.hpp @@ -3,6 +3,10 @@ #include "ggml_extend.hpp" +#ifdef SD_USE_VULKAN +#include "ggml-vulkan.h" +#endif + class DownSampleBlock : public GGMLBlock { protected: int channels; @@ -248,9 +252,6 @@ class FeedForward : public GGMLBlock { float scale = 1.f; if (precision_fix) { scale = 1.f / 128.f; -#ifdef SD_USE_VULKAN - force_prec_f32 = true; -#endif } // The purpose of the scale here is to prevent NaN issues in certain situations. // For example, when using Vulkan without enabling force_prec_f32, @@ -264,6 +265,11 @@ class FeedForward : public GGMLBlock { auto net_0 = std::dynamic_pointer_cast(blocks["net.0"]); auto net_2 = std::dynamic_pointer_cast(blocks["net.2"]); + #ifdef SD_USE_VULKAN + if(ggml_backend_is_vk(ctx->backend)){ + net_2->set_force_prec_f32(true); + } + #endif x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim] x = net_2->forward(ctx, x); // [ne3, ne2, ne1, dim_out] diff --git a/conditioner.hpp b/conditioner.hpp index a4e84aa3b..238c28c05 100644 --- a/conditioner.hpp +++ b/conditioner.hpp @@ -2,8 +2,11 @@ #define __CONDITIONER_HPP__ #include "clip.hpp" +#include "ggml-alloc.h" +#include "ggml-backend.h" #include "llm.hpp" #include "t5.hpp" +#include "util.h" struct SDCondition { struct ggml_tensor* c_crossattn = nullptr; // aka context @@ -62,7 +65,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { std::vector token_embed_custom; std::map> embedding_pos_map; - FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend, + FrozenCLIPEmbedderWithCustomWords(std::vector backends, bool offload_params_to_cpu, const String2TensorStorage& tensor_storage_map, const std::map& orig_embedding_map, @@ -76,13 +79,27 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { tokenizer.add_special_token(name); } bool force_clip_f32 = !embedding_map.empty(); + + ggml_backend_t clip_backend = backends[0]; + if (sd_version_is_sd1(version)) { - text_model = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32); + LOG_INFO("CLIP-L: using %s backend", ggml_backend_name(clip_backend)); + text_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32); } else if (sd_version_is_sd2(version)) { - text_model = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32); + LOG_INFO("CLIP-H: using %s backend", ggml_backend_name(clip_backend)); + text_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32); } else if (sd_version_is_sdxl(version)) { - text_model = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32); - text_model2 = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32); + ggml_backend_t clip_g_backend = clip_backend; + if (backends.size() >= 2){ + clip_g_backend = backends[1]; + if (backends.size() > 2) { + LOG_WARN("More than 2 clip backends provided, but the model only supports 2 text encoders. 
Ignoring the rest."); + } + } + LOG_INFO("CLIP-L: using %s backend", ggml_backend_name(clip_backend)); + LOG_INFO("CLIP-G: using %s backend", ggml_backend_name(clip_g_backend)); + text_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32); + text_model2 = std::make_shared(clip_g_backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32); } } @@ -702,13 +719,29 @@ struct SD3CLIPEmbedder : public Conditioner { std::shared_ptr clip_g; std::shared_ptr t5; - SD3CLIPEmbedder(ggml_backend_t backend, + SD3CLIPEmbedder(std::vector backends, bool offload_params_to_cpu, const String2TensorStorage& tensor_storage_map = {}) : clip_g_tokenizer(0) { bool use_clip_l = false; bool use_clip_g = false; bool use_t5 = false; + + ggml_backend_t clip_l_backend, clip_g_backend, t5_backend; + if (backends.size() == 1) { + clip_l_backend = clip_g_backend = t5_backend = backends[0]; + } else if (backends.size() == 2) { + clip_l_backend = clip_g_backend = backends[0]; + t5_backend = backends[1]; + } else if (backends.size() >= 3) { + clip_l_backend = backends[0]; + clip_g_backend = backends[1]; + t5_backend = backends[2]; + if (backends.size() > 3) { + LOG_WARN("More than 3 clip backends provided, but the model only supports 3 text encoders. Ignoring the rest."); + } + } + for (auto pair : tensor_storage_map) { if (pair.first.find("text_encoders.clip_l") != std::string::npos) { use_clip_l = true; @@ -723,13 +756,16 @@ struct SD3CLIPEmbedder : public Conditioner { return; } if (use_clip_l) { - clip_l = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false); + LOG_INFO("CLIP-L: using %s backend", ggml_backend_name(clip_l_backend)); + clip_l = std::make_shared(clip_l_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false); } if (use_clip_g) { - clip_g = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false); + LOG_INFO("CLIP-G: using %s backend", ggml_backend_name(clip_g_backend)); + clip_g = std::make_shared(clip_g_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false); } if (use_t5) { - t5 = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer"); + LOG_INFO("T5-XXL: using %s backend", ggml_backend_name(t5_backend)); + t5 = std::make_shared(t5_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer"); } } @@ -1123,11 +1159,25 @@ struct FluxCLIPEmbedder : public Conditioner { std::shared_ptr t5; size_t chunk_len = 256; - FluxCLIPEmbedder(ggml_backend_t backend, + FluxCLIPEmbedder(std::vector backends, bool offload_params_to_cpu, const String2TensorStorage& tensor_storage_map = {}) { bool use_clip_l = false; bool use_t5 = false; + + + ggml_backend_t clip_l_backend, t5_backend; + if (backends.size() == 1) { + clip_l_backend = t5_backend = backends[0]; + } else if (backends.size() >= 2) { + clip_l_backend = backends[0]; + t5_backend = backends[1]; + if (backends.size() > 2) { + LOG_WARN("More than 2 clip backends provided, but the model only supports 2 text encoders. 
Ignoring the rest."); + } + } + + for (auto pair : tensor_storage_map) { if (pair.first.find("text_encoders.clip_l") != std::string::npos) { use_clip_l = true; @@ -1142,12 +1192,14 @@ struct FluxCLIPEmbedder : public Conditioner { } if (use_clip_l) { - clip_l = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true); + LOG_INFO("CLIP-L: using %s backend", ggml_backend_name(clip_l_backend)); + clip_l = std::make_shared(clip_l_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true); } else { LOG_WARN("clip_l text encoder not found! Prompt adherence might be degraded."); } if (use_t5) { - t5 = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer"); + LOG_INFO("T5-XXL: using %s backend", ggml_backend_name(clip_l_backend)); + t5 = std::make_shared(t5_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer"); } else { LOG_WARN("t5xxl text encoder not found! Prompt adherence might be degraded."); } diff --git a/docs/rpc.md b/docs/rpc.md new file mode 100644 index 000000000..44485478c --- /dev/null +++ b/docs/rpc.md @@ -0,0 +1,220 @@ +# Building and Using the RPC Server with `stable-diffusion.cpp` + +This guide covers how to build a version of [the RPC server from `llama.cpp`](https://github.com/ggml-org/llama.cpp/blob/master/tools/rpc/README.md) that is compatible with your version of `stable-diffusion.cpp` to manage multi-backends setups. RPC allows you to offload specific model components to a remote server. + +> **Note on Model Location:** The model files (e.g., `.safetensors` or `.gguf`) remain on the **Client** machine. The client parses the file and transmits the necessary tensor data and computational graphs to the server. The server does not need to store the model files locally. + +## 1. Building `stable-diffusion.cpp` with RPC client + +First, you should build the client application from source. It requires `GGML_RPC=ON` to include the RPC backend to your client. + +```bash +mkdir build +cd build +cmake .. \ + -DGGML_RPC=ON \ + # Add other build flags here (e.g., -DSD_VULKAN=ON) +cmake --build . --config Release -j $(nproc) +``` + +> **Note:** Ensure you add the other flags you would normally use (e.g., `-DSD_VULKAN=ON`, `-DSD_CUDA=ON`, `-DSD_HIPBLAS=ON`, or `-DGGML_METAL=ON`), for more information about building `stable-diffusion.cpp` from source, please refer to the [build.md](build.md) documentation. + +## 2. Ensure `llama.cpp` is at the correct commit + +`stable-diffusion.cpp`'s RPC client is designed to work with a specific version of `llama.cpp` (compatible with the `ggml` submodule) to ensure API compatibility. The commit hash for `llama.cpp` is stored in `ggml/scripts/sync-llama.last`. + +> **Start from Root:** Perform these steps from the root of your `stable-diffusion.cpp` directory. + +1. Read the target commit hash from the submodule tracker: + + ```bash + # Linux / WSL / MacOS + HASH=$(cat ggml/scripts/sync-llama.last) + + # Windows (PowerShell) + $HASH = Get-Content -Path "ggml\scripts\sync-llama.last" + ``` + +2. Clone `llama.cpp` at the target commit . 
+ ```bash + git clone https://github.com/ggml-org/llama.cpp.git + cd llama.cpp + git checkout $HASH + ``` + To save on download time and storage, you can use a shallow clone to download only the target commit: + ```bash + mkdir -p llama.cpp + cd llama.cpp + git init + git remote add origin https://github.com/ggml-org/llama.cpp.git + git fetch --depth 1 origin $HASH + git checkout FETCH_HEAD + ``` + +## 3. Build `llama.cpp` (RPC Server) + +The RPC server acts as the worker. You must explicitly enable the **backend** (the hardware interface, such as CUDA for Nvidia, Metal for Apple Silicon, or Vulkan) when building, otherwise the server will default to using only the CPU. + +To find the correct flags for your system, refer to the official documentation for the [`llama.cpp`](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md) repository. + +> **Crucial:** You must include the compiler flags required to satisfy the API compatibility with `stable-diffusion.cpp` (`-DGGML_MAX_NAME=128`). Without this flag, `GGML_MAX_NAME` will default to `64` for the server, and data transfers between the client and server will fail. Of course, `-DGGML_RPC` must also be enabled. +> +> I recommend disabling the `LLAMA_CURL` flag to avoid unnecessary dependencies, and disabling shared library builds to avoid potential conflicts. + +> **Build Target:** We are specifically building the `rpc-server` target. This prevents the build system from compiling the entire `llama.cpp` suite (like `llama-server`), making the build significantly faster. + +### Linux / WSL (Vulkan) + +```bash +mkdir build +cd build +cmake .. -DGGML_RPC=ON \ + -DGGML_VULKAN=ON \ # Ensure backend is enabled + -DGGML_BUILD_SHARED_LIBS=OFF \ + -DLLAMA_CURL=OFF \ + -DCMAKE_C_FLAGS=-DGGML_MAX_NAME=128 \ + -DCMAKE_CXX_FLAGS=-DGGML_MAX_NAME=128 +cmake --build . --config Release --target rpc-server -j $(nproc) +``` + +### macOS (Metal) + +```bash +mkdir build +cd build +cmake .. -DGGML_RPC=ON \ + -DGGML_METAL=ON \ + -DGGML_BUILD_SHARED_LIBS=OFF \ + -DLLAMA_CURL=OFF \ + -DCMAKE_C_FLAGS=-DGGML_MAX_NAME=128 \ + -DCMAKE_CXX_FLAGS=-DGGML_MAX_NAME=128 +cmake --build . --config Release --target rpc-server +``` + +### Windows (Visual Studio 2022, Vulkan) + +```powershell +mkdir build +cd build +cmake .. -G "Visual Studio 17 2022" -A x64 ` + -DGGML_RPC=ON ` + -DGGML_VULKAN=ON ` + -DGGML_BUILD_SHARED_LIBS=OFF ` + -DLLAMA_CURL=OFF ` + -DCMAKE_C_FLAGS=-DGGML_MAX_NAME=128 ` + -DCMAKE_CXX_FLAGS=-DGGML_MAX_NAME=128 +cmake --build . --config Release --target rpc-server +``` + +## 4. Usage + +Once both applications are built, you can run the server and the client to manage your GPU allocation. + +### Step A: Run the RPC Server + +Start the server. It listens for connections on the default address (usually `localhost:50052`). If your server is on a different machine, ensure the server binds to the correct interface and your firewall allows the connection. + +**On the Server :** +If running on the same machine, you can use the default address: + +```bash +./rpc-server +``` + +If you want to allow connections from other machines on the network: + +```bash +./rpc-server --host 0.0.0.0 +``` + +> **Security Warning:** The RPC server does not currently support authentication or encryption. **Only run the server on trusted local networks**. Never expose the RPC server directly to the open internet. + +> **Drivers & Hardware:** Ensure the Server machine has the necessary drivers installed and functional (e.g., Nvidia Drivers for CUDA, Vulkan SDK, or Metal). 
If no devices are found, the server will simply fallback to CPU usage. + +### Step B: Check if the client is able to connect to the server and see the available devices + +We're assuming the server is running on your local machine, and listening on the default port `50052`. If it's running on a different machine, you can replace `localhost` with the IP address of the server. + +**On the Client:** + +```bash +./sd-cli --rpc localhost:50052 --list-devices +``` + +If the server is running and the client is able to connect, you should see `RPC0 localhost:50052` in the list of devices. + +Example output: +(Client built without GPU acceleration, two GPUs available on the server) + +``` +List of available GGML devices: +Name Description +------------------- +CPU AMD Ryzen 9 5900X 12-Core Processor +RPC0 localhost:50052 +RPC1 localhost:50052 +``` + +### Step C: Run with RPC device + +If everything is working correctly, you can now run the client while offloading some or all of the work to the RPC server. + +Example: Setting the main backend to the RPC0 device for doing all the work on the server. + +```bash +./sd-cli -m models/sd1.5.safetensors -p "A cat" --rpc localhost:50052 --main-backend-device RPC0 +``` + +--- + +## 5. Scaling: Multiple RPC Servers + +You can connect the client to multiple RPC servers simultaneously to scale out your hardware usage. + +Example: A main machine (192.168.1.10) with 3 GPUs, with one GPU running CUDA and the other two running Vulkan, and a second machine (192.168.1.11) only one GPU. + +**On the first machine (Running two server instances):** + +**Terminal 1 (CUDA):** + +```bash +# Linux / WSL +export CUDA_VISIBLE_DEVICES=0 +cd ./build_cuda/bin/Release +./rpc-server --host 0.0.0.0 + +# Windows PowerShell +$env:CUDA_VISIBLE_DEVICES="0" +cd .\build_cuda\bin\Release +./rpc-server --host 0.0.0.0 +``` + +**Terminal 2 (Vulkan):** + +```bash +cd ./build_vulkan/bin/Release +# ignore the first GPU (used by CUDA server) +./rpc-server --host 0.0.0.0 --port 50053 -d Vulkan1,Vulkan2 +``` + +**On the second machine:** + +```bash +cd ./build/bin/Release +./rpc-server --host 0.0.0.0 +``` + +**On the Client:** +Pass multiple server addresses separated by commas. + +```bash +./sd-cli --rpc 192.168.1.10:50052,192.168.1.10:50053,192.168.1.11:50052 --list-devices +``` + +The client will map these servers to sequential device IDs (e.g., RPC0 from the first server, RPC2, RPC3 from the second, and RPC4 from the third). With this setup, you could for example use RPC0 for the main backend, RPC1 and RPC2 for the text encoders, and RPC3 for the VAE. + +--- + +## 6. Performance Considerations + +RPC performance is heavily dependent on network bandwidth, as large weights and activations must be transferred back and forth over the network, especially for large models, or when using high resolutions. For best results, ensure your network connection is stable and has sufficient bandwidth (>1Gbps recommended). diff --git a/examples/cli/README.md b/examples/cli/README.md index 84dd5c716..0a29a1982 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -5,10 +5,10 @@ usage: ./bin/sd-cli [options] CLI Options: -o, --output path to write result image to. you can use printf-style %d format specifiers for image sequences (default: ./output.png) (eg. 
output_%03d.png) - --output-begin-idx starting index for output image sequence, must be non-negative (default 0 if specified %d in output path, 1 otherwise) --preview-path path to write preview image to (default: ./preview.png) --preview-interval interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at every step) + --output-begin-idx starting index for output image sequence, must be non-negative (default 0 if specified %d in output path, 1 otherwise) --canny apply canny preprocessor (edge detection) --convert-name convert tensor name (for convert mode) -v, --verbose print extra info @@ -18,6 +18,8 @@ CLI Options: -M, --mode run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen --preview preview method. must be one of the following [none, proj, tae, vae] (default is none) -h, --help show this help message and exit + --rpc add a rpc device + --list-devices list available ggml compute devices Context Options: -m, --model path to full model @@ -40,6 +42,17 @@ Context Options: --tensor-type-rules weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0") --photo-maker path to PHOTOMAKER model --upscale-model path to esrgan model. + --main-backend-device default device to use for all backends (defaults to main gpu device if hardware acceleration is available, otherwise + cpu) + --diffusion-backend-device device to use for diffusion (defaults to main-backend-device) + --clip-backend-device device to use for clip (defaults to main-backend-device). Can be a comma-separated list of devices for models with + multiple encoders + --vae-backend-device device to use for vae (defaults to main-backend-device). Also applies to tae, unless tae-backend-device is specified + --tae-backend-device device to use for tae (defaults to vae-backend-device) + --control-net-backend-device device to use for control net (defaults to main-backend-device) + --upscaler-backend-device device to use for upscaling models (defaults to main-backend-device) + --photomaker-backend-device device to use for photomaker (defaults to main-backend-device) + --vision-backend-device device to use for clip-vision model (defaults to main-backend-device) -t, --threads number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of CPU physical cores --chroma-t5-mask-pad t5 mask pad size of chroma @@ -49,9 +62,6 @@ Context Options: --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed --mmap whether to memory-map model - --control-net-cpu keep controlnet in cpu (for low vram) - --clip-on-cpu keep clip in cpu (for low vram) - --vae-on-cpu keep vae in cpu (for low vram) --diffusion-fa use flash attention in the diffusion model --diffusion-conv-direct use ggml_conv2d_direct in the diffusion model --vae-conv-direct use ggml_conv2d_direct in the vae model @@ -59,6 +69,7 @@ Context Options: --circularx enable circular RoPE wrapping on x-axis (width) only --circulary enable circular RoPE wrapping on y-axis (height) only --chroma-disable-dit-mask disable dit mask for chroma + --qwen-image-zero-cond-t enable zero_cond_t for qwen image --chroma-enable-t5-mask enable t5 mask for chroma --type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). 
If not specified, the default is the type of the weight file diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index ab58ab5f0..eb79a51cc 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -46,6 +46,7 @@ struct SDCliParams { bool color = false; bool normal_exit = false; + bool skip_usage = false; ArgOptions get_options() { ArgOptions options; @@ -143,7 +144,27 @@ struct SDCliParams { auto on_help_arg = [&](int argc, const char** argv, int index) { normal_exit = true; - return -1; + return VALID_BREAK_OPT; + }; + + auto on_rpc_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* rpc_device = argv[index]; + add_rpc_device(rpc_device); + return 1; + }; + + auto on_list_devices_arg = [&](int argc, const char** argv, int index) { + size_t buff_size = backend_list_size(); + char* buff = (char*)malloc(buff_size); + list_backends_to_buffer(buff, buff_size); + printf("List of available GGML devices:\nName\tDescription\n-------------------\n%s\n", buff); + free(buff); + normal_exit = true; + skip_usage = true; + return VALID_BREAK_OPT; }; options.manual_options = { @@ -159,6 +180,14 @@ struct SDCliParams { "--help", "show this help message and exit", on_help_arg}, + {"", + "--rpc", + "add a rpc device", + on_rpc_arg}, + {"", + "--list-devices", + "list available ggml compute devices", + on_list_devices_arg}, }; return options; @@ -213,7 +242,9 @@ void parse_args(int argc, const char** argv, SDCliParams& cli_params, SDContextP std::vector options_vec = {cli_params.get_options(), ctx_params.get_options(), gen_params.get_options()}; if (!parse_options(argc, argv, options_vec)) { - print_usage(argc, argv, options_vec); + if (!cli_params.skip_usage){ + print_usage(argc, argv, options_vec); + } exit(cli_params.normal_exit ? 0 : 1); } @@ -783,7 +814,8 @@ int main(int argc, const char* argv[]) { ctx_params.offload_params_to_cpu, ctx_params.diffusion_conv_direct, ctx_params.n_threads, - gen_params.upscale_tile_size); + gen_params.upscale_tile_size, + ctx_params.upscaler_backend_device.c_str()); if (upscaler_ctx == nullptr) { LOG_ERROR("new_upscaler_ctx failed"); diff --git a/examples/common/common.hpp b/examples/common/common.hpp index ba1b0d8d9..6ee16344d 100644 --- a/examples/common/common.hpp +++ b/examples/common/common.hpp @@ -34,6 +34,8 @@ namespace fs = std::filesystem; #define SAFE_STR(s) ((s) ? (s) : "") #define BOOL_STR(b) ((b) ? "true" : "false") +#define VALID_BREAK_OPT -42 + const char* modes_str[] = { "img_gen", "vid_gen", @@ -401,16 +403,26 @@ static bool parse_options(int argc, const char** argv, const std::vector embedding_map; std::vector embedding_vec; @@ -454,9 +476,6 @@ struct SDContextParams { rng_type_t sampler_rng_type = RNG_TYPE_COUNT; bool offload_params_to_cpu = false; bool enable_mmap = false; - bool control_net_cpu = false; - bool clip_on_cpu = false; - bool vae_on_cpu = false; bool diffusion_flash_attn = false; bool diffusion_conv_direct = false; bool vae_conv_direct = false; @@ -561,6 +580,43 @@ struct SDContextParams { "--upscale-model", "path to esrgan model.", &esrgan_path}, + {"", + "--main-backend-device", + "default device to use for all backends (defaults to main gpu device if hardware acceleration is available, otherwise cpu)", + &main_backend_device}, + {"", + "--diffusion-backend-device", + "device to use for diffusion (defaults to main-backend-device)", + &diffusion_backend_device}, + {"", + "--clip-backend-device", + "device to use for clip (defaults to main-backend-device). 
Can be a comma-separated list of devices for models with multiple encoders", + &clip_backend_device}, + {"", + "--vae-backend-device", + "device to use for vae (defaults to main-backend-device). Also applies to tae, unless tae-backend-device is specified", + &vae_backend_device}, + {"", + "--tae-backend-device", + "device to use for tae (defaults to vae-backend-device)", + &tae_backend_device}, + {"", + "--control-net-backend-device", + "device to use for control net (defaults to main-backend-device)", + &control_net_backend_device}, + {"", + "--upscaler-backend-device", + "device to use for upscaling models (defaults to main-backend-device)", + &upscaler_backend_device}, + {"", + "--photomaker-backend-device", + "device to use for photomaker (defaults to main-backend-device)", + &photomaker_backend_device}, + {"", + "--vision-backend-device", + "device to use for clip-vision model (defaults to main-backend-device)", + &vision_backend_device}, + }; options.int_options = { @@ -603,18 +659,6 @@ struct SDContextParams { "--mmap", "whether to memory-map model", true, &enable_mmap}, - {"", - "--control-net-cpu", - "keep controlnet in cpu (for low vram)", - true, &control_net_cpu}, - {"", - "--clip-on-cpu", - "keep clip in cpu (for low vram)", - true, &clip_on_cpu}, - {"", - "--vae-on-cpu", - "keep vae in cpu (for low vram)", - true, &vae_on_cpu}, {"", "--diffusion-fa", "use flash attention in the diffusion model", @@ -875,6 +919,7 @@ struct SDContextParams { std::string embeddings_str = emb_ss.str(); std::ostringstream oss; + // TODO backend devices oss << "SDContextParams {\n" << " n_threads: " << n_threads << ",\n" << " model_path: \"" << model_path << "\",\n" @@ -901,9 +946,9 @@ struct SDContextParams { << " flow_shift: " << (std::isinf(flow_shift) ? "INF" : std::to_string(flow_shift)) << "\n" << " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n" << " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n" - << " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n" - << " clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n" - << " vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n" + // << " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n" + // << " clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n" + // << " vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n" << " diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n" << " diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n" << " vae_conv_direct: " << (vae_conv_direct ? 
"true" : "false") << ",\n" @@ -965,9 +1010,6 @@ struct SDContextParams { lora_apply_mode, offload_params_to_cpu, enable_mmap, - clip_on_cpu, - control_net_cpu, - vae_on_cpu, diffusion_flash_attn, taesd_preview, diffusion_conv_direct, @@ -980,6 +1022,14 @@ struct SDContextParams { chroma_t5_mask_pad, qwen_image_zero_cond_t, flow_shift, + main_backend_device.c_str(), + diffusion_backend_device.c_str(), + clip_backend_device.c_str(), + vae_backend_device.c_str(), + tae_backend_device.c_str(), + control_net_backend_device.c_str(), + photomaker_backend_device.c_str(), + vision_backend_device.c_str(), }; return sd_ctx_params; } diff --git a/examples/server/README.md b/examples/server/README.md index 7e6681570..a6000ad0f 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -4,12 +4,12 @@ usage: ./bin/sd-server [options] Svr Options: - -l, --listen-ip server listen ip (default: 127.0.0.1) - --listen-port server listen port (default: 1234) - --serve-html-path path to HTML file to serve at root (optional) - -v, --verbose print extra info - --color colors the logging tags according to level - -h, --help show this help message and exit + -l, --listen-ip server listen ip (default: 127.0.0.1) + --serve-html-path path to HTML file to serve at root (optional) + --listen-port server listen port (default: 1234) + -v, --verbose print extra info + --color colors the logging tags according to level + -h, --help show this help message and exit Context Options: -m, --model path to full model @@ -32,6 +32,17 @@ Context Options: --tensor-type-rules weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0") --photo-maker path to PHOTOMAKER model --upscale-model path to esrgan model. + --main-backend-device default device to use for all backends (defaults to main gpu device if hardware acceleration is available, otherwise + cpu) + --diffusion-backend-device device to use for diffusion (defaults to main-backend-device) + --clip-backend-device device to use for clip (defaults to main-backend-device). Can be a comma-separated list of devices for models with + multiple encoders + --vae-backend-device device to use for vae (defaults to main-backend-device). Also applies to tae, unless tae-backend-device is specified + --tae-backend-device device to use for tae (defaults to vae-backend-device) + --control-net-backend-device device to use for control net (defaults to main-backend-device) + --upscaler-backend-device device to use for upscaling models (defaults to main-backend-device) + --photomaker-backend-device device to use for photomaker (defaults to main-backend-device) + --vision-backend-device device to use for clip-vision model (defaults to main-backend-device) -t, --threads number of threads to use during computation (default: -1). 
If threads <= 0, then threads will be set to the number of CPU physical cores --chroma-t5-mask-pad t5 mask pad size of chroma @@ -40,9 +51,6 @@ Context Options: --vae-tiling process vae in tiles to reduce memory usage --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed - --control-net-cpu keep controlnet in cpu (for low vram) - --clip-on-cpu keep clip in cpu (for low vram) - --vae-on-cpu keep vae in cpu (for low vram) --mmap whether to memory-map model --diffusion-fa use flash attention in the diffusion model --diffusion-conv-direct use ggml_conv2d_direct in the diffusion model @@ -51,6 +59,7 @@ Context Options: --circularx enable circular RoPE wrapping on x-axis (width) only --circulary enable circular RoPE wrapping on y-axis (height) only --chroma-disable-dit-mask disable dit mask for chroma + --qwen-image-zero-cond-t enable zero_cond_t for qwen image --chroma-enable-t5-mask enable t5 mask for chroma --type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the type of the weight file diff --git a/ggml b/ggml index 8891ab6fc..b6d1f0f24 160000 --- a/ggml +++ b/ggml @@ -1 +1 @@ -Subproject commit 8891ab6fc742ac1198736d3da3b73c730e42af84 +Subproject commit b6d1f0f247adcfa25c0ca1ffe97e651fe1afd5e2 diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 7dac03738..2a587549e 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -28,26 +28,6 @@ #include "model.h" -#ifdef SD_USE_CUDA -#include "ggml-cuda.h" -#endif - -#ifdef SD_USE_METAL -#include "ggml-metal.h" -#endif - -#ifdef SD_USE_VULKAN -#include "ggml-vulkan.h" -#endif - -#ifdef SD_USE_OPENCL -#include "ggml-opencl.h" -#endif - -#ifdef SD_USE_SYCL -#include "ggml-sycl.h" -#endif - #include "rng.hpp" #include "util.h" @@ -88,6 +68,42 @@ __STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const cha } } +__STATIC_INLINE__ bool backend_name_exists(std::string name) { + const int device_count = ggml_backend_dev_count(); + for (int i = 0; i < device_count; i++) { + if (name == ggml_backend_dev_name(ggml_backend_dev_get(i))) { + return true; + } + } + return false; +} + +__STATIC_INLINE__ std::string sanitize_backend_name(std::string name) { + if (name == "" || backend_name_exists(name)) { + return name; + } else { + LOG_WARN("Backend %s not found, using default backend", name.c_str()); + return ""; + } +} + +__STATIC_INLINE__ std::string get_default_backend_name() { + // should pick the same backend as ggml_backend_init_best + ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU); + dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU); + dev = dev ? 
dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + return ggml_backend_dev_name(dev); +} + +__STATIC_INLINE__ ggml_backend_t init_named_backend(std::string name = "") { + LOG_DEBUG("Initializing backend: %s", name.c_str()); + if (name.empty()) { + return ggml_backend_init_best(); + } else { + return ggml_backend_init_by_name(name.c_str(), nullptr); + } +} + static_assert(GGML_MAX_NAME >= 128, "GGML_MAX_NAME must be at least 128"); // n-mode tensor-matrix product @@ -2220,6 +2236,14 @@ class Linear : public UnaryBlock { force_prec_f32(force_prec_f32), scale(scale) {} + void set_scale(float scale_){ + scale = scale_; + } + + void set_force_prec_f32(bool force_prec_f32_){ + force_prec_f32 = force_prec_f32_; + } + struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { struct ggml_tensor* w = params["weight"]; struct ggml_tensor* b = nullptr; diff --git a/model.cpp b/model.cpp index 253dd25cd..786b8c739 100644 --- a/model.cpp +++ b/model.cpp @@ -29,18 +29,6 @@ #include "name_conversion.h" #include "stable-diffusion.h" -#ifdef SD_USE_METAL -#include "ggml-metal.h" -#endif - -#ifdef SD_USE_VULKAN -#include "ggml-vulkan.h" -#endif - -#ifdef SD_USE_OPENCL -#include "ggml-opencl.h" -#endif - #define ST_HEADER_SIZE_LEN 8 uint64_t read_u64(uint8_t* buffer) { diff --git a/qwen_image.hpp b/qwen_image.hpp index dfa539788..87952ef2d 100644 --- a/qwen_image.hpp +++ b/qwen_image.hpp @@ -7,6 +7,10 @@ #include "flux.hpp" #include "ggml_extend.hpp" +#ifdef SD_USE_VULKAN +#include "ggml-vulkan.h" +#endif + namespace Qwen { constexpr int QWEN_IMAGE_GRAPH_SIZE = 20480; @@ -96,9 +100,7 @@ namespace Qwen { float scale = 1.f / 32.f; bool force_prec_f32 = false; -#ifdef SD_USE_VULKAN - force_prec_f32 = true; -#endif + // The purpose of the scale here is to prevent NaN issues in certain situations. // For example when using CUDA but the weights are k-quants (not all prompts). 
blocks["to_out.0"] = std::shared_ptr(new Linear(inner_dim, out_dim, out_bias, false, force_prec_f32, scale)); @@ -124,6 +126,11 @@ namespace Qwen { auto to_k = std::dynamic_pointer_cast(blocks["to_k"]); auto to_v = std::dynamic_pointer_cast(blocks["to_v"]); auto to_out_0 = std::dynamic_pointer_cast(blocks["to_out.0"]); +#ifdef SD_USE_VULKAN + if(ggml_backend_is_vk(ctx->backend)){ + to_out_0->set_force_prec_f32(true); + } +#endif auto norm_added_q = std::dynamic_pointer_cast(blocks["norm_added_q"]); auto norm_added_k = std::dynamic_pointer_cast(blocks["norm_added_k"]); diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index b181f994b..60e4c6338 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1,3 +1,4 @@ +#include "ggml-cpu.h" #include "ggml_extend.hpp" #include "model.h" @@ -5,6 +6,7 @@ #include "rng_mt19937.hpp" #include "rng_philox.hpp" #include "stable-diffusion.h" +#include #include "util.h" #include "cache_dit.hpp" @@ -94,14 +96,129 @@ void suppress_pp(int step, int steps, float time, void* data) { return; } +std::vector string_split(const std::string & input, char separator) +{ + std::vector parts; + size_t begin_pos = 0; + size_t separator_pos = input.find(separator); + while (separator_pos != std::string::npos) { + std::string part = input.substr(begin_pos, separator_pos - begin_pos); + parts.emplace_back(part); + begin_pos = separator_pos + 1; + separator_pos = input.find(separator, begin_pos); + } + parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos)); + return parts; +} + +static void add_rpc_devices(const std::string & servers) { + auto rpc_servers = string_split(servers, ','); + if (rpc_servers.empty()) { + throw std::invalid_argument("no RPC servers specified"); + } + ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC"); + if (!rpc_reg) { + throw std::invalid_argument("failed to find RPC backend"); + } + typedef ggml_backend_reg_t (*ggml_backend_rpc_add_server_t)(const char * endpoint); + ggml_backend_rpc_add_server_t ggml_backend_rpc_add_server_fn = (ggml_backend_rpc_add_server_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_server"); + if (!ggml_backend_rpc_add_server_fn) { + throw std::invalid_argument("failed to find RPC add server function"); + } + for (const auto & server : rpc_servers) { + auto reg = ggml_backend_rpc_add_server_fn(server.c_str()); + ggml_backend_register(reg); + } +} + +void add_rpc_device(const char* servers_cstr){ + std::string servers(servers_cstr); + add_rpc_devices(servers); +} + +std::vector sanitize_backend_name_list(std::string name) { + std::vector vec = {}; + if (name == "" || backend_name_exists(name)) { + // single backend + vec.push_back(name); + } else if (name.find(",") != std::string::npos) { + // comma-separated backend names + std::stringstream ss(name); + std::string token; + while (std::getline(ss, token, ',')) { + if (token == "" || backend_name_exists(token)) { + vec.push_back(token); + } else { + LOG_WARN("backend name %s not found, using default", token.c_str()); + vec.push_back(""); + } + } + } else { + vec.push_back(""); + } + return vec; +} + +std::vector> list_backends_vector() { + std::vector> backends; + const int device_count = ggml_backend_dev_count(); + for (int i = 0; i < device_count; i++) { + auto dev = ggml_backend_dev_get(i); + backends.push_back({ggml_backend_dev_name(dev), ggml_backend_dev_description(dev)}); + } + return backends; +} + +SD_API size_t backend_list_size(){ + // for C API + size_t buffer_size = 0; + auto backends = 
list_backends_vector(); + for (auto& backend : backends) { + auto dev_name_size = backend.first.size(); + auto dev_desc_size = backend.second.size(); + buffer_size+=dev_name_size+dev_desc_size+2; // +2 for the separators + } + return buffer_size; +} + +// devices are separated by \n and name and description are separated by \t +SD_API void list_backends_to_buffer(char* buffer, size_t buffer_size) { + auto backends = list_backends_vector(); + size_t offset = 0; + for (auto& backend : backends) { + size_t name_size = backend.first.size(); + size_t desc_size = backend.second.size(); + if (offset + name_size + desc_size + 2 > buffer_size) { + break; // Not enough space in the buffer + } + memcpy(buffer + offset, backend.first.c_str(), name_size); + offset += name_size; + buffer[offset++] = '\t'; + memcpy(buffer + offset, backend.second.c_str(), desc_size); + offset += desc_size; + buffer[offset++] = '\n'; + } + if (offset < buffer_size) { + buffer[offset] = '\0'; // Ensure the buffer is null-terminated at the end + } else { + LOG_WARN("Provided buffer size is too small to contain details of all devices."); + buffer[buffer_size - 1] = '\0'; // Ensure the buffer is null-terminated at the end + } +} + /*=============================================== StableDiffusionGGML ================================================*/ class StableDiffusionGGML { public: ggml_backend_t backend = nullptr; // general backend - ggml_backend_t clip_backend = nullptr; + ggml_backend_t diffusion_backend = nullptr; ggml_backend_t control_net_backend = nullptr; ggml_backend_t vae_backend = nullptr; + ggml_backend_t tae_backend = nullptr; + ggml_backend_t pmid_backend = nullptr; + ggml_backend_t vision_backend = nullptr; + + std::vector clip_backends = {nullptr}; SDVersion version; bool vae_decode_only = false; @@ -147,72 +264,32 @@ class StableDiffusionGGML { StableDiffusionGGML() = default; ~StableDiffusionGGML() { - if (clip_backend != backend) { - ggml_backend_free(clip_backend); + if (diffusion_backend != backend) { + ggml_backend_free(diffusion_backend); + } + for(auto clip_backend : clip_backends) { + if (clip_backend != backend) { + ggml_backend_free(clip_backend); + } } if (control_net_backend != backend) { ggml_backend_free(control_net_backend); } + if (tae_backend != vae_backend) { + ggml_backend_free(tae_backend); + } if (vae_backend != backend) { ggml_backend_free(vae_backend); } ggml_backend_free(backend); } - void init_backend() { -#ifdef SD_USE_CUDA - LOG_DEBUG("Using CUDA backend"); - backend = ggml_backend_cuda_init(0); -#endif -#ifdef SD_USE_METAL - LOG_DEBUG("Using Metal backend"); - backend = ggml_backend_metal_init(); -#endif -#ifdef SD_USE_VULKAN - LOG_DEBUG("Using Vulkan backend"); - size_t device = 0; - const int device_count = ggml_backend_vk_get_device_count(); - if (device_count) { - const char* SD_VK_DEVICE = getenv("SD_VK_DEVICE"); - if (SD_VK_DEVICE != nullptr) { - std::string sd_vk_device_str = SD_VK_DEVICE; - try { - device = std::stoull(sd_vk_device_str); - } catch (const std::invalid_argument&) { - LOG_WARN("SD_VK_DEVICE environment variable is not a valid integer (%s). Falling back to device 0.", SD_VK_DEVICE); - device = 0; - } catch (const std::out_of_range&) { - LOG_WARN("SD_VK_DEVICE environment variable value is out of range for `unsigned long long` type (%s). Falling back to device 0.", SD_VK_DEVICE); - device = 0; - } - if (device >= device_count) { - LOG_WARN("Cannot find targeted vulkan device (%llu). 
Falling back to device 0.", device); - device = 0; - } - } - LOG_INFO("Vulkan: Using device %llu", device); - backend = ggml_backend_vk_init(device); - } - if (!backend) { - LOG_WARN("Failed to initialize Vulkan backend"); - } -#endif -#ifdef SD_USE_OPENCL - LOG_DEBUG("Using OpenCL backend"); - // ggml_log_set(ggml_log_callback_default, nullptr); // Optional ggml logs - backend = ggml_backend_opencl_init(); - if (!backend) { - LOG_WARN("Failed to initialize OpenCL backend"); - } -#endif -#ifdef SD_USE_SYCL - LOG_DEBUG("Using SYCL backend"); - backend = ggml_backend_sycl_init(0); -#endif - if (!backend) { - LOG_DEBUG("Using CPU backend"); - backend = ggml_backend_cpu_init(); + void log_backends() { + const int device_count = ggml_backend_dev_count(); + for (int i = 0; i < device_count; i++) { + auto dev = ggml_backend_dev_get(i); + LOG_INFO("%s (%s)", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev)); } } @@ -243,7 +320,54 @@ class StableDiffusionGGML { ggml_log_set(ggml_log_callback_default, nullptr); - init_backend(); + log_backends(); + + std::string default_backend_name = get_default_backend_name(); + + std::string override_default_backend_name = sanitize_backend_name(SAFE_STR(sd_ctx_params->main_device)); + + if (override_default_backend_name.size() > 0) { + LOG_INFO("Setting default backend to %s", override_default_backend_name.c_str()); + default_backend_name = override_default_backend_name; + } + + std::string diffusion_backend_name = sanitize_backend_name(SAFE_STR(sd_ctx_params->diffusion_device)); + std::vector clip_backend_names = sanitize_backend_name_list(SAFE_STR(sd_ctx_params->clip_device)); + std::string control_net_backend_name = sanitize_backend_name(SAFE_STR(sd_ctx_params->control_net_device)); + std::string vae_backend_name = sanitize_backend_name(SAFE_STR(sd_ctx_params->vae_device)); + std::string tae_backend_name = sanitize_backend_name(SAFE_STR(sd_ctx_params->tae_device)); + std::string pmid_backend_name = sanitize_backend_name(SAFE_STR(sd_ctx_params->photomaker_device)); + std::string vision_backend_name = sanitize_backend_name(SAFE_STR(sd_ctx_params->vision_device)); + + bool diffusion_backend_is_default = diffusion_backend_name.empty() || diffusion_backend_name == default_backend_name; + bool clip_backends_are_default = true; + for (const auto& clip_backend_name : clip_backend_names) { + if (!clip_backend_name.empty() && clip_backend_name != default_backend_name) { + clip_backends_are_default = false; + break; + } + } + bool control_net_backend_is_default = (control_net_backend_name.empty() || control_net_backend_name == default_backend_name); + bool vae_backend_is_default = (vae_backend_name.empty() || vae_backend_name == default_backend_name); + // if tae_backend_name is empty, it will use the same backend as vae + bool tae_backend_is_default = (tae_backend_name.empty() && vae_backend_is_default) || tae_backend_name == default_backend_name; + bool pmid_backend_is_default = (pmid_backend_name.empty() || pmid_backend_name == default_backend_name); + bool vision_backend_is_default = (vision_backend_name.empty() || vision_backend_name == default_backend_name); + + // if some backend is not specified or is the same as the default backend, use the default backend + bool use_default_backend = diffusion_backend_is_default || clip_backends_are_default || control_net_backend_is_default || vae_backend_is_default || tae_backend_is_default || pmid_backend_is_default || vision_backend_is_default; + + if (use_default_backend) { + backend = 
init_named_backend(override_default_backend_name); + LOG_DEBUG("Loaded default backend %s", ggml_backend_name(backend)); + } + + if (!diffusion_backend_is_default) { + diffusion_backend = init_named_backend(diffusion_backend_name); + LOG_INFO("Using diffusion backend: %s", ggml_backend_name(diffusion_backend)); + } else { + diffusion_backend = backend; + } ModelLoader model_loader; @@ -419,21 +543,24 @@ class StableDiffusionGGML { LOG_INFO("Using circular padding for convolutions"); } - bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu; - { - clip_backend = backend; - if (clip_on_cpu && !ggml_backend_is_cpu(backend)) { - LOG_INFO("CLIP: Using CPU backend"); - clip_backend = ggml_backend_cpu_init(); + if (!clip_backends_are_default) { + clip_backends.clear(); + for(auto clip_backend_name : clip_backend_names){ + auto clip_backend = init_named_backend(clip_backend_name); + LOG_INFO("CLIP: Using %s backend", ggml_backend_name(clip_backend)); + clip_backends.push_back(clip_backend); + } + }else{ + clip_backends = {backend}; } if (sd_version_is_sd3(version)) { - cond_stage_model = std::make_shared(clip_backend, + cond_stage_model = std::make_shared(clip_backends, offload_params_to_cpu, tensor_storage_map); - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map); + diffusion_model = std::make_shared(diffusion_backend, + offload_params_to_cpu, + tensor_storage_map); } else if (sd_version_is_flux(version)) { bool is_chroma = false; for (auto pair : tensor_storage_map) { @@ -452,53 +579,53 @@ class StableDiffusionGGML { "--chroma-disable-dit-mask as a workaround."); } - cond_stage_model = std::make_shared(clip_backend, + cond_stage_model = std::make_shared(clip_backends[0], offload_params_to_cpu, tensor_storage_map, sd_ctx_params->chroma_use_t5_mask, sd_ctx_params->chroma_t5_mask_pad); } else if (version == VERSION_OVIS_IMAGE) { - cond_stage_model = std::make_shared(clip_backend, + cond_stage_model = std::make_shared(clip_backends[0], offload_params_to_cpu, tensor_storage_map, version, "", false); } else { - cond_stage_model = std::make_shared(clip_backend, + cond_stage_model = std::make_shared(clip_backends, offload_params_to_cpu, tensor_storage_map); } - diffusion_model = std::make_shared(backend, + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, version, sd_ctx_params->chroma_use_dit_mask); } else if (sd_version_is_flux2(version)) { bool is_chroma = false; - cond_stage_model = std::make_shared(clip_backend, + cond_stage_model = std::make_shared(clip_backends[0], offload_params_to_cpu, tensor_storage_map, version); - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - version, - sd_ctx_params->chroma_use_dit_mask); + diffusion_model = std::make_shared(diffusion_backend, + offload_params_to_cpu, + tensor_storage_map, + version, + sd_ctx_params->chroma_use_dit_mask); } else if (sd_version_is_wan(version)) { - cond_stage_model = std::make_shared(clip_backend, + cond_stage_model = std::make_shared(clip_backends[0], offload_params_to_cpu, tensor_storage_map, true, 1, true); - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - "model.diffusion_model", - version); + diffusion_model = std::make_shared(diffusion_backend, + offload_params_to_cpu, + tensor_storage_map, + "model.diffusion_model", + version); if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) { - high_noise_diffusion_model = std::make_shared(backend, + 
high_noise_diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.high_noise_diffusion_model", @@ -507,7 +634,7 @@ class StableDiffusionGGML { if (diffusion_model->get_desc() == "Wan2.1-I2V-14B" || diffusion_model->get_desc() == "Wan2.1-FLF2V-14B" || diffusion_model->get_desc() == "Wan2.1-I2V-1.3B") { - clip_vision = std::make_shared(backend, + clip_vision = std::make_shared(vision_backend, offload_params_to_cpu, tensor_storage_map); clip_vision->alloc_params_buffer(); @@ -518,48 +645,48 @@ class StableDiffusionGGML { if (!vae_decode_only) { enable_vision = true; } - cond_stage_model = std::make_shared(clip_backend, + cond_stage_model = std::make_shared(clip_backends[0], offload_params_to_cpu, tensor_storage_map, version, "", enable_vision); - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - "model.diffusion_model", - version, - sd_ctx_params->qwen_image_zero_cond_t); + diffusion_model = std::make_shared(diffusion_backend, + offload_params_to_cpu, + tensor_storage_map, + "model.diffusion_model", + version, + sd_ctx_params->qwen_image_zero_cond_t); } else if (sd_version_is_z_image(version)) { - cond_stage_model = std::make_shared(clip_backend, + cond_stage_model = std::make_shared(clip_backends[0], offload_params_to_cpu, tensor_storage_map, version); - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - "model.diffusion_model", - version); + diffusion_model = std::make_shared(diffusion_backend, + offload_params_to_cpu, + tensor_storage_map, + "model.diffusion_model", + version); } else { // SD1.x SD2.x SDXL std::map embbeding_map; for (uint32_t i = 0; i < sd_ctx_params->embedding_count; i++) { embbeding_map.emplace(SAFE_STR(sd_ctx_params->embeddings[i].name), SAFE_STR(sd_ctx_params->embeddings[i].path)); } if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) { - cond_stage_model = std::make_shared(clip_backend, + cond_stage_model = std::make_shared(clip_backends, offload_params_to_cpu, tensor_storage_map, embbeding_map, version, PM_VERSION_2); } else { - cond_stage_model = std::make_shared(clip_backend, + cond_stage_model = std::make_shared(clip_backends, offload_params_to_cpu, tensor_storage_map, embbeding_map, version); } - diffusion_model = std::make_shared(backend, + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, version); @@ -592,11 +719,15 @@ class StableDiffusionGGML { high_noise_diffusion_model->get_param_tensors(tensors); } - if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu(backend)) { - LOG_INFO("VAE Autoencoder: Using CPU backend"); - vae_backend = ggml_backend_cpu_init(); - } else { - vae_backend = backend; + vae_backend = backend; + if (!vae_backend_is_default) { + vae_backend = init_named_backend(vae_backend_name); + LOG_INFO("VAE Autoencoder: Using %s backend", ggml_backend_name(vae_backend)); + } + tae_backend = vae_backend; + if (tae_backend_name.length() > 0 && tae_backend_name != vae_backend_name) { + tae_backend = init_named_backend(tae_backend_name); + LOG_INFO("Tiny Autoencoder: Using %s backend", ggml_backend_name(tae_backend)); } if (!(use_tiny_autoencoder || version == VERSION_SDXS) || tae_preview_only) { @@ -639,14 +770,14 @@ class StableDiffusionGGML { } if (use_tiny_autoencoder || version == VERSION_SDXS) { if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) { - tae_first_stage = std::make_shared(vae_backend, + tae_first_stage = 
std::make_shared(tae_backend, offload_params_to_cpu, tensor_storage_map, "decoder", vae_decode_only, version); } else { - tae_first_stage = std::make_shared(vae_backend, + tae_first_stage = std::make_shared(tae_backend, offload_params_to_cpu, tensor_storage_map, "decoder.layers", @@ -664,14 +795,13 @@ class StableDiffusionGGML { } if (strlen(SAFE_STR(sd_ctx_params->control_net_path)) > 0) { - ggml_backend_t controlnet_backend = nullptr; - if (sd_ctx_params->keep_control_net_on_cpu && !ggml_backend_is_cpu(backend)) { - LOG_DEBUG("ControlNet: Using CPU backend"); - controlnet_backend = ggml_backend_cpu_init(); + if (!control_net_backend_is_default) { + control_net_backend = init_named_backend(control_net_backend_name); + LOG_INFO("ControlNet: Using %s backend", ggml_backend_name(control_net_backend)); } else { - controlnet_backend = backend; + control_net_backend = backend; } - control_net = std::make_shared(controlnet_backend, + control_net = std::make_shared(control_net_backend, offload_params_to_cpu, tensor_storage_map, version); @@ -680,9 +810,15 @@ class StableDiffusionGGML { control_net->set_conv2d_direct_enabled(true); } } - + pmid_backend = backend; + if (!pmid_backend_is_default) { + pmid_backend = init_named_backend(pmid_backend_name); + LOG_INFO("PhotoMaker: Using %s backend", ggml_backend_name(pmid_backend)); + } else { + pmid_backend = backend; + } if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) { - pmid_model = std::make_shared(backend, + pmid_model = std::make_shared(pmid_backend, offload_params_to_cpu, tensor_storage_map, "pmid", @@ -690,14 +826,14 @@ class StableDiffusionGGML { PM_VERSION_2); LOG_INFO("using PhotoMaker Version 2"); } else { - pmid_model = std::make_shared(backend, + pmid_model = std::make_shared(pmid_backend, offload_params_to_cpu, tensor_storage_map, "pmid", version); } if (strlen(SAFE_STR(sd_ctx_params->photo_maker_path)) > 0) { - pmid_lora = std::make_shared("pmid", backend, sd_ctx_params->photo_maker_path, "", version); + pmid_lora = std::make_shared("pmid", diffusion_backend, sd_ctx_params->photo_maker_path, "", version); auto lora_tensor_filter = [&](const std::string& tensor_name) { if (starts_with(tensor_name, "lora.model")) { return true; @@ -817,13 +953,15 @@ class StableDiffusionGGML { size_t total_params_ram_size = 0; size_t total_params_vram_size = 0; - if (ggml_backend_is_cpu(clip_backend)) { + + // TODO: split by individual text encoders + if (ggml_backend_is_cpu(clip_backends[0])) { total_params_ram_size += clip_params_mem_size + pmid_params_mem_size; } else { total_params_vram_size += clip_params_mem_size + pmid_params_mem_size; } - if (ggml_backend_is_cpu(backend)) { + if (ggml_backend_is_cpu(diffusion_backend)) { total_params_ram_size += unet_params_mem_size; } else { total_params_vram_size += unet_params_mem_size; @@ -849,7 +987,8 @@ class StableDiffusionGGML { total_params_vram_size / 1024.0 / 1024.0, total_params_ram_size / 1024.0 / 1024.0, clip_params_mem_size / 1024.0 / 1024.0, - ggml_backend_is_cpu(clip_backend) ? "RAM" : "VRAM", + // TODO: split + ggml_backend_is_cpu(clip_backends[0]) ? "RAM" : "VRAM", unet_params_mem_size / 1024.0 / 1024.0, ggml_backend_is_cpu(backend) ? "RAM" : "VRAM", vae_params_mem_size / 1024.0 / 1024.0, @@ -857,7 +996,7 @@ class StableDiffusionGGML { control_net_params_mem_size / 1024.0 / 1024.0, ggml_backend_is_cpu(control_net_backend) ? "RAM" : "VRAM", pmid_params_mem_size / 1024.0 / 1024.0, - ggml_backend_is_cpu(clip_backend) ? "RAM" : "VRAM"); + ggml_backend_is_cpu(pmid_backend) ? 
"RAM" : "VRAM"); } // init denoiser @@ -1052,8 +1191,15 @@ class StableDiffusionGGML { for (auto& kv : lora_state_diff) { int64_t t0 = ggml_time_ms(); - - auto lora = load_lora_model_from_file(kv.first, kv.second, backend); + // TODO: Fix that + bool are_clip_backends_compatible = true; + for (auto backend: clip_backends){ + are_clip_backends_compatible = are_clip_backends_compatible && (diffusion_backend==backend || ggml_backend_is_cpu(backend)); + } + if(!are_clip_backends_compatible){ + LOG_WARN("Diffusion models and text encoders are running on different backends. This may cause issues when immediately applying LoRAs."); + } + auto lora = load_lora_model_from_file(kv.first, kv.second, diffusion_backend); if (!lora || lora->lora_tensors.empty()) { continue; } @@ -1098,8 +1244,8 @@ class StableDiffusionGGML { for (auto& kv : lora_state_diff) { const std::string& lora_id = kv.first; float multiplier = kv.second; - - auto lora = load_lora_model_from_file(lora_id, multiplier, clip_backend, lora_tensor_filter); + //TODO: split by model + auto lora = load_lora_model_from_file(lora_id, multiplier, clip_backends[0], lora_tensor_filter); if (lora && !lora->lora_tensors.empty()) { lora->preprocess_lora_tensors(tensors); cond_stage_lora_models.push_back(lora); @@ -1131,7 +1277,7 @@ class StableDiffusionGGML { const std::string& lora_name = kv.first; float multiplier = kv.second; - auto lora = load_lora_model_from_file(lora_name, multiplier, backend, lora_tensor_filter); + auto lora = load_lora_model_from_file(lora_name, multiplier, diffusion_backend, lora_tensor_filter); if (lora && !lora->lora_tensors.empty()) { lora->preprocess_lora_tensors(tensors); diffusion_lora_models.push_back(lora); @@ -2893,9 +3039,6 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->lora_apply_mode = LORA_APPLY_AUTO; sd_ctx_params->offload_params_to_cpu = false; sd_ctx_params->enable_mmap = false; - sd_ctx_params->keep_clip_on_cpu = false; - sd_ctx_params->keep_control_net_on_cpu = false; - sd_ctx_params->keep_vae_on_cpu = false; sd_ctx_params->diffusion_flash_attn = false; sd_ctx_params->circular_x = false; sd_ctx_params->circular_y = false; @@ -2910,7 +3053,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { if (!buf) return nullptr; buf[0] = '\0'; - + // TODO devices snprintf(buf + strlen(buf), 4096 - strlen(buf), "model_path: %s\n" "clip_l_path: %s\n" @@ -2934,9 +3077,6 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "sampler_rng_type: %s\n" "prediction: %s\n" "offload_params_to_cpu: %s\n" - "keep_clip_on_cpu: %s\n" - "keep_control_net_on_cpu: %s\n" - "keep_vae_on_cpu: %s\n" "diffusion_flash_attn: %s\n" "circular_x: %s\n" "circular_y: %s\n" @@ -2965,9 +3105,6 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { sd_rng_type_name(sd_ctx_params->sampler_rng_type), sd_prediction_name(sd_ctx_params->prediction), BOOL_STR(sd_ctx_params->offload_params_to_cpu), - BOOL_STR(sd_ctx_params->keep_clip_on_cpu), - BOOL_STR(sd_ctx_params->keep_control_net_on_cpu), - BOOL_STR(sd_ctx_params->keep_vae_on_cpu), BOOL_STR(sd_ctx_params->diffusion_flash_attn), BOOL_STR(sd_ctx_params->circular_x), BOOL_STR(sd_ctx_params->circular_y), diff --git a/stable-diffusion.h b/stable-diffusion.h index 8f040d2bd..a10dd7d60 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -183,9 +183,9 @@ typedef struct { enum lora_apply_mode_t lora_apply_mode; bool offload_params_to_cpu; bool enable_mmap; - bool keep_clip_on_cpu; - bool keep_control_net_on_cpu; - bool 
diff --git a/stable-diffusion.h b/stable-diffusion.h
index 8f040d2bd..a10dd7d60 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -183,9 +183,9 @@ typedef struct {
     enum lora_apply_mode_t lora_apply_mode;
     bool offload_params_to_cpu;
     bool enable_mmap;
-    bool keep_clip_on_cpu;
-    bool keep_control_net_on_cpu;
-    bool keep_vae_on_cpu;
+    // bool keep_clip_on_cpu;
+    // bool keep_control_net_on_cpu;
+    // bool keep_vae_on_cpu;
     bool diffusion_flash_attn;
     bool tae_preview_only;
     bool diffusion_conv_direct;
@@ -198,6 +198,14 @@ typedef struct {
     int chroma_t5_mask_pad;
     bool qwen_image_zero_cond_t;
     float flow_shift;
+    const char* main_device;
+    const char* diffusion_device;
+    const char* clip_device;
+    const char* vae_device;
+    const char* tae_device;
+    const char* control_net_device;
+    const char* photomaker_device;
+    const char* vision_device;
 } sd_ctx_params_t;
 
 typedef struct {
@@ -377,7 +385,8 @@ SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
                                         bool offload_params_to_cpu,
                                         bool direct,
                                         int n_threads,
-                                        int tile_size);
+                                        int tile_size,
+                                        const char* device);
 
 SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
 SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx,
@@ -403,6 +412,11 @@ SD_API bool preprocess_canny(sd_image_t image,
 
 SD_API const char* sd_commit(void);
 SD_API const char* sd_version(void);
+SD_API size_t backend_list_size(void);
+SD_API void list_backends_to_buffer(char* buffer, size_t buffer_size);
+
+SD_API void add_rpc_device(const char* address);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/upscaler.cpp b/upscaler.cpp
index 29ac981e6..ea198f166 100644
--- a/upscaler.cpp
+++ b/upscaler.cpp
@@ -22,37 +22,20 @@ struct UpscalerGGML {
 
     bool load_from_file(const std::string& esrgan_path,
                         bool offload_params_to_cpu,
-                        int n_threads) {
+                        int n_threads,
+                        std::string device = "") {
         ggml_log_set(ggml_log_callback_default, nullptr);
-#ifdef SD_USE_CUDA
-        LOG_DEBUG("Using CUDA backend");
-        backend = ggml_backend_cuda_init(0);
-#endif
-#ifdef SD_USE_METAL
-        LOG_DEBUG("Using Metal backend");
-        backend = ggml_backend_metal_init();
-#endif
-#ifdef SD_USE_VULKAN
-        LOG_DEBUG("Using Vulkan backend");
-        backend = ggml_backend_vk_init(0);
-#endif
-#ifdef SD_USE_OPENCL
-        LOG_DEBUG("Using OpenCL backend");
-        backend = ggml_backend_opencl_init();
-#endif
-#ifdef SD_USE_SYCL
-        LOG_DEBUG("Using SYCL backend");
-        backend = ggml_backend_sycl_init(0);
-#endif
+        device = sanitize_backend_name(device);
+        backend = init_named_backend(device);
         ModelLoader model_loader;
         if (!model_loader.init_from_file_and_convert_name(esrgan_path)) {
             LOG_ERROR("init model loader from file failed: '%s'", esrgan_path.c_str());
         }
         model_loader.set_wtype_override(model_data_type);
-        if (!backend) {
-            LOG_DEBUG("Using CPU backend");
-            backend = ggml_backend_cpu_init();
-        }
+        // if (!backend) {
+        //     LOG_DEBUG("Using CPU backend");
+        //     backend = ggml_backend_cpu_init();
+        // }
         LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
         esrgan_upscaler = std::make_shared(backend, offload_params_to_cpu, tile_size, model_loader.get_tensor_storage_map());
         if (direct) {
@@ -117,7 +100,8 @@ upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
                                  bool offload_params_to_cpu,
                                  bool direct,
                                  int n_threads,
-                                 int tile_size) {
+                                 int tile_size,
+                                 const char* device) {
     upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t));
     if (upscaler_ctx == nullptr) {
         return nullptr;
     }
@@ -129,7 +113,7 @@ upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
         return nullptr;
     }
 
-    if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu, n_threads)) {
+    if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu, n_threads, SAFE_STR(device))) {
         delete upscaler_ctx->upscaler;
         upscaler_ctx->upscaler = nullptr;
         free(upscaler_ctx);
diff --git a/z_image.hpp b/z_image.hpp
index cee23833a..ef1e48dff 100644
--- a/z_image.hpp
+++ b/z_image.hpp
@@ -7,6 +7,14 @@
 #include "ggml_extend.hpp"
 #include "mmdit.hpp"
 
+#ifdef SD_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
+
+#if GGML_USE_HIP
+#include "ggml-cuda.h"
+#endif
+
 // Ref: https://github.com/Alpha-VLLM/Lumina-Image-2.0/blob/main/models/model.py
 // Ref: https://github.com/huggingface/diffusers/pull/12703
 
@@ -31,10 +39,6 @@ namespace ZImage {
             : head_dim(head_dim), num_heads(num_heads), num_kv_heads(num_kv_heads), qk_norm(qk_norm) {
             blocks["qkv"] = std::make_shared(hidden_size, (num_heads + num_kv_heads * 2) * head_dim, false);
             float scale = 1.f;
-#if GGML_USE_HIP
-            // Prevent NaN issues with certain ROCm setups
-            scale = 1.f / 16.f;
-#endif
             blocks["out"] = std::make_shared(num_heads * head_dim, hidden_size, false, false, false, scale);
             if (qk_norm) {
                 blocks["q_norm"] = std::make_shared(head_dim);
@@ -51,6 +55,12 @@ namespace ZImage {
             int64_t N = x->ne[2];
             auto qkv_proj = std::dynamic_pointer_cast(blocks["qkv"]);
             auto out_proj = std::dynamic_pointer_cast(blocks["out"]);
+#if GGML_USE_HIP
+            // Prevent NaN issues with certain ROCm setups
+            if (ggml_backend_is_cuda(ctx->backend)) {
+                out_proj->set_scale(1.f / 16.f);
+            }
+#endif
 
             auto qkv = qkv_proj->forward(ctx, x);  // [N, n_token, (num_heads + num_kv_heads*2)*head_dim]
             qkv = ggml_reshape_4d(ctx->ggml_ctx, qkv, head_dim, num_heads + num_kv_heads * 2, qkv->ne[1], qkv->ne[2]);  // [N, n_token, num_heads + num_kv_heads*2, head_dim]
@@ -115,9 +125,7 @@ namespace ZImage {
 
             bool force_prec_f32 = false;
             float scale = 1.f / 128.f;
-#ifdef SD_USE_VULKAN
-            force_prec_f32 = true;
-#endif
+
             // The purpose of the scale here is to prevent NaN issues in certain situations.
             // For example, when using CUDA but the weights are k-quants.
             blocks["w2"] = std::make_shared(hidden_dim, dim, false, false, force_prec_f32, scale);
@@ -128,6 +136,11 @@ namespace ZImage {
             auto w1 = std::dynamic_pointer_cast(blocks["w1"]);
             auto w2 = std::dynamic_pointer_cast(blocks["w2"]);
             auto w3 = std::dynamic_pointer_cast(blocks["w3"]);
+#ifdef SD_USE_VULKAN
+            if(ggml_backend_is_vk(ctx->backend)){
+                w2->set_force_prec_f32(true);
+            }
+#endif
             auto x1 = w1->forward(ctx, x);
             auto x3 = w3->forward(ctx, x);