rename qwenvl to llm

leejet · leejet · commit 7a2a7d076784 · 2025-11-29T14:06:46.000+08:00
diff --git a/conditioner.hpp b/conditioner.hpp
@@ -2,7 +2,7 @@
 #define __CONDITIONER_HPP__
 
 #include "clip.hpp"
-#include "qwenvl.hpp"
+#include "llm.hpp"
 #include "t5.hpp"
 
 struct SDCondition {
@@ -1648,12 +1648,12 @@ struct LLMEmbedder : public Conditioner {
                                                backend,
                                                offload_params_to_cpu,
                                                tensor_storage_map,
-                                               "text_encoders.qwen2vl",
+                                               "text_encoders.llm",
                                                enable_vision);
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
-        llm->get_param_tensors(tensors, "text_encoders.qwen2vl");
+        llm->get_param_tensors(tensors, "text_encoders.llm");
     }
 
     void alloc_params_buffer() override {
diff --git a/docs/qwen_image.md b/docs/qwen_image.md
@@ -14,7 +14,7 @@
 ## Examples
 
 ```
-.\bin\Release\sd.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\qwen-image-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors  --qwen2vl ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf  -p '一个穿着"QWEN"标志的T恤的中国美女正拿着黑色的马克笔面相镜头微笑。她身后的玻璃板上手写体写着 “一、Qwen-Image的技术路线： 探索视觉生成基础模型的极限，开创理解与生成一体化的未来。二、Qwen-Image的模型特色：1、复杂文字渲染。支持中英渲染、自动布局； 2、精准图像编辑。支持文字编辑、物体增减、风格变换。三、Qwen-Image的未来愿景：赋能专业内容创作、助力生成式AI发展。”' --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu -H 1024 -W 1024 --diffusion-fa --flow-shift 3
+.\bin\Release\sd.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\qwen-image-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf  -p '一个穿着"QWEN"标志的T恤的中国美女正拿着黑色的马克笔面相镜头微笑。她身后的玻璃板上手写体写着 “一、Qwen-Image的技术路线： 探索视觉生成基础模型的极限，开创理解与生成一体化的未来。二、Qwen-Image的模型特色：1、复杂文字渲染。支持中英渲染、自动布局； 2、精准图像编辑。支持文字编辑、物体增减、风格变换。三、Qwen-Image的未来愿景：赋能专业内容创作、助力生成式AI发展。”' --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu -H 1024 -W 1024 --diffusion-fa --flow-shift 3
 ```
 
 <img alt="qwen example" src="../assets/qwen/example.png" />
diff --git a/docs/qwen_image_edit.md b/docs/qwen_image_edit.md
@@ -20,7 +20,7 @@
 ### Qwen Image Edit
 
 ```
-.\bin\Release\sd.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Qwen_Image_Edit-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors  --qwen2vl ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'" --seed 1118877715456453
+.\bin\Release\sd.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Qwen_Image_Edit-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'" --seed 1118877715456453
 ```
 
 <img alt="qwen_image_edit" src="../assets/qwen/qwen_image_edit.png" />
@@ -29,7 +29,7 @@
 ### Qwen Image Edit 2509
 
 ```
-.\bin\Release\sd.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Qwen-Image-Edit-2509-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors  --qwen2vl ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf --qwen2vl_vision ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct.mmproj-Q8_0.gguf --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'Qwen Image Edit 2509'"
+.\bin\Release\sd.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Qwen-Image-Edit-2509-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf --llm_vision ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct.mmproj-Q8_0.gguf --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'Qwen Image Edit 2509'"
 ```
 
 <img alt="qwen_image_edit_2509" src="../assets/qwen/qwen_image_edit_2509.png" />
diff --git a/examples/cli/README.md b/examples/cli/README.md
@@ -9,8 +9,10 @@ Options:
   --clip_g <string>                        path to the clip-g text encoder
   --clip_vision <string>                   path to the clip-vision encoder
   --t5xxl <string>                         path to the t5xxl text encoder
-  --qwen2vl <string>                       path to the qwen2vl text encoder
-  --qwen2vl_vision <string>                path to the qwen2vl vit
+  --llm <string>                           path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)
+  --llm_vision <string>                    path to the llm vit
+  --qwen2vl <string>                       alias of --llm. Deprecated.
+  --qwen2vl_vision <string>                alias of --llm_vision. Deprecated.
   --diffusion-model <string>               path to the standalone diffusion model
   --high-noise-diffusion-model <string>    path to the standalone high noise diffusion model
   --vae <string>                           path to standalone vae model
@@ -33,7 +35,6 @@ Options:
   -p, --prompt <string>                    the prompt to render
   -n, --negative-prompt <string>           the negative prompt (default: "")
   --preview-path <string>                  path to write preview image to (default: ./preview.png)
-  --easycache <string>                     enable EasyCache for DiT models, accepts optional "threshold,start_percent,end_percent" values (defaults to 0.2,0.15,0.95)
   --upscale-model <string>                 path to esrgan model.
   -t, --threads <int>                      number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
                                            CPU physical cores
@@ -105,20 +106,19 @@ Options:
                                            contain any quantized parameters, the at_runtime mode will be used; otherwise,
                                            immediately will be used.The immediately mode may have precision and
                                            compatibility issues with quantized parameters, but it usually offers faster inference
-                                           speed and, in some cases, lower memory usage. The at_runtime mode, on the other
-                                           hand, is exactly the opposite.
+                                           speed and, in some cases, lower memory usage. The at_runtime mode, on the
+                                           other hand, is exactly the opposite.
   --scheduler                              denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, lcm],
                                            default: discrete
   --skip-layers                            layers to skip for SLG steps (default: [7,8,9])
   --high-noise-sampling-method             (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
                                            ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise
-  --high-noise-scheduler                   (high noise) denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform,
-                                           simple], default: discrete
   --high-noise-skip-layers                 (high noise) layers to skip for SLG steps (default: [7,8,9])
   -r, --ref-image                          reference image for Flux Kontext models (can be used multiple times)
   -h, --help                               show this help message and exit
   --vae-tile-size                          tile size for vae tiling, format [X]x[Y] (default: 32x32)
   --vae-relative-tile-size                 relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
                                            (overrides --vae-tile-size)
   --preview                                preview method. must be one of the following [none, proj, tae, vae] (default is none)
+  --easycache                              enable EasyCache for DiT models with optional "threshold,start_percent,end_percent" (default: 0.2,0.15,0.95)
 ```
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
@@ -70,8 +70,8 @@ struct SDParams {
     std::string clip_g_path;
     std::string clip_vision_path;
     std::string t5xxl_path;
-    std::string qwen2vl_path;
-    std::string qwen2vl_vision_path;
+    std::string llm_path;
+    std::string llm_vision_path;
     std::string diffusion_model_path;
     std::string high_noise_diffusion_model_path;
     std::string vae_path;
@@ -174,8 +174,8 @@ void print_params(SDParams params) {
     printf("    clip_g_path:                       %s\n", params.clip_g_path.c_str());
     printf("    clip_vision_path:                  %s\n", params.clip_vision_path.c_str());
     printf("    t5xxl_path:                        %s\n", params.t5xxl_path.c_str());
-    printf("    qwen2vl_path:                      %s\n", params.qwen2vl_path.c_str());
-    printf("    qwen2vl_vision_path:               %s\n", params.qwen2vl_vision_path.c_str());
+    printf("    llm_path:                          %s\n", params.llm_path.c_str());
+    printf("    llm_vision_path:                   %s\n", params.llm_vision_path.c_str());
     printf("    diffusion_model_path:              %s\n", params.diffusion_model_path.c_str());
     printf("    high_noise_diffusion_model_path:   %s\n", params.high_noise_diffusion_model_path.c_str());
     printf("    vae_path:                          %s\n", params.vae_path.c_str());
@@ -532,14 +532,22 @@ void parse_args(int argc, const char** argv, SDParams& params) {
          "--t5xxl",
          "path to the t5xxl text encoder",
          &params.t5xxl_path},
+        {"",
+         "--llm",
+         "path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)",
+         &params.llm_path},
+        {"",
+         "--llm_vision",
+         "path to the llm vit",
+         &params.llm_vision_path},
         {"",
          "--qwen2vl",
-         "path to the qwen2vl text encoder",
-         &params.qwen2vl_path},
+         "alias of --llm. Deprecated.",
+         &params.llm_path},
         {"",
          "--qwen2vl_vision",
-         "path to the qwen2vl vit",
-         &params.qwen2vl_vision_path},
+         "alias of --llm_vision. Deprecated.",
+         &params.llm_vision_path},
         {"",
          "--diffusion-model",
          "path to the standalone diffusion model",
@@ -1230,7 +1238,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
          on_relative_tile_size_arg},
         {"",
          "--preview",
-         std::string("preview method. must be one of the following [") + previews_str[0] + ", " + previews_str[1] + ", " + previews_str[2] + ", " + previews_str[3] + "] (default is " + previews_str[PREVIEW_NONE] + ")\n",
+         std::string("preview method. must be one of the following [") + previews_str[0] + ", " + previews_str[1] + ", " + previews_str[2] + ", " + previews_str[3] + "] (default is " + previews_str[PREVIEW_NONE] + ")",
          on_preview_arg},
         {"",
          "--easycache",
@@ -1428,7 +1436,7 @@ std::string get_image_params(SDParams params, int64_t seed) {
         parameter_string += " " + std::string(sd_scheduler_name(params.sample_params.scheduler));
     }
     parameter_string += ", ";
-    for (const auto& te : {params.clip_l_path, params.clip_g_path, params.t5xxl_path, params.qwen2vl_path, params.qwen2vl_vision_path}) {
+    for (const auto& te : {params.clip_l_path, params.clip_g_path, params.t5xxl_path, params.llm_path, params.llm_vision_path}) {
         if (!te.empty()) {
             parameter_string += "TE: " + sd_basename(te) + ", ";
         }
@@ -1845,8 +1853,8 @@ int main(int argc, const char* argv[]) {
         params.clip_g_path.c_str(),
         params.clip_vision_path.c_str(),
         params.t5xxl_path.c_str(),
-        params.qwen2vl_path.c_str(),
-        params.qwen2vl_vision_path.c_str(),
+        params.llm_path.c_str(),
+        params.llm_vision_path.c_str(),
         params.diffusion_model_path.c_str(),
         params.high_noise_diffusion_model_path.c_str(),
         params.vae_path.c_str(),
diff --git a/llm.hpp b/llm.hpp
@@ -1549,7 +1549,7 @@ namespace LLM {
             ggml_type model_data_type = GGML_TYPE_COUNT;
 
             ModelLoader model_loader;
-            if (!model_loader.init_from_file_and_convert_name(file_path, "text_encoders.qwen2vl.")) {
+            if (!model_loader.init_from_file_and_convert_name(file_path, "text_encoders.llm.")) {
                 LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
                 return;
             }
@@ -1569,12 +1569,12 @@ namespace LLM {
                                                                              backend,
                                                                              true,
                                                                              tensor_storage_map,
-                                                                             "text_encoders.qwen2vl",
+                                                                             "text_encoders.llm",
                                                                              true);
 
             llm->alloc_params_buffer();
             std::map<std::string, ggml_tensor*> tensors;
-            llm->get_param_tensors(tensors, "text_encoders.qwen2vl");
+            llm->get_param_tensors(tensors, "text_encoders.llm");
 
             bool success = model_loader.load_tensors(tensors);
 
diff --git a/model.cpp b/model.cpp
@@ -105,8 +105,8 @@ const char* unused_tensors[] = {
     "denoiser.sigmas",
     "edm_vpred.sigma_max",
     "text_encoders.t5xxl.transformer.encoder.embed_tokens.weight",  // only used during training
-    "text_encoders.qwen2vl.output.weight",
-    "text_encoders.qwen2vl.lm_head.",
+    "text_encoders.llm.output.weight",
+    "text_encoders.llm.lm_head.",
     "first_stage_model.bn.",
 };
 
diff --git a/name_conversion.cpp b/name_conversion.cpp
@@ -127,7 +127,7 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
         {"token_embd.", "shared."},
     };
 
-    static const std::vector<std::pair<std::string, std::string>> qwenvl_name_map{
+    static const std::vector<std::pair<std::string, std::string>> llm_name_map{
         {"token_embd.", "model.embed_tokens."},
         {"blk.", "model.layers."},
         {"attn_q.", "self_attn.q_proj."},
@@ -142,7 +142,7 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
         {"output_norm.", "model.norm."},
     };
 
-    static const std::vector<std::pair<std::string, std::string>> qwenvl_vision_name_map{
+    static const std::vector<std::pair<std::string, std::string>> llm_vision_name_map{
         {"mm.", "merger.mlp."},
         {"v.post_ln.", "merger.ln_q."},
         {"v.patch_embd.weight", "patch_embed.proj.0.weight"},
@@ -161,11 +161,11 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
     };
     if (contains(name, "t5xxl")) {
         replace_with_name_map(name, t5_name_map);
-    } else if (contains(name, "qwen2vl")) {
-        if (contains(name, "qwen2vl.visual")) {
-            replace_with_name_map(name, qwenvl_vision_name_map);
+    } else if (contains(name, "llm")) {
+        if (contains(name, "llm.visual")) {
+            replace_with_name_map(name, llm_vision_name_map);
         } else {
-            replace_with_name_map(name, qwenvl_name_map);
+            replace_with_name_map(name, llm_name_map);
         }
     } else {
         name = convert_open_clip_to_hf_clip_name(name);
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
@@ -276,17 +276,17 @@ class StableDiffusionGGML {
             }
         }
 
-        if (strlen(SAFE_STR(sd_ctx_params->qwen2vl_path)) > 0) {
-            LOG_INFO("loading qwen2vl from '%s'", sd_ctx_params->qwen2vl_path);
-            if (!model_loader.init_from_file(sd_ctx_params->qwen2vl_path, "text_encoders.qwen2vl.")) {
-                LOG_WARN("loading qwen2vl from '%s' failed", sd_ctx_params->qwen2vl_path);
+        if (strlen(SAFE_STR(sd_ctx_params->llm_path)) > 0) {
+            LOG_INFO("loading llm from '%s'", sd_ctx_params->llm_path);
+            if (!model_loader.init_from_file(sd_ctx_params->llm_path, "text_encoders.llm.")) {
+                LOG_WARN("loading llm from '%s' failed", sd_ctx_params->llm_path);
             }
         }
 
-        if (strlen(SAFE_STR(sd_ctx_params->qwen2vl_vision_path)) > 0) {
-            LOG_INFO("loading qwen2vl vision from '%s'", sd_ctx_params->qwen2vl_vision_path);
-            if (!model_loader.init_from_file(sd_ctx_params->qwen2vl_vision_path, "text_encoders.qwen2vl.visual.")) {
-                LOG_WARN("loading qwen2vl vision from '%s' failed", sd_ctx_params->qwen2vl_vision_path);
+        if (strlen(SAFE_STR(sd_ctx_params->llm_vision_path)) > 0) {
+            LOG_INFO("loading llm vision from '%s'", sd_ctx_params->llm_vision_path);
+            if (!model_loader.init_from_file(sd_ctx_params->llm_vision_path, "text_encoders.llm.visual.")) {
+                LOG_WARN("loading llm vision from '%s' failed", sd_ctx_params->llm_vision_path);
             }
         }
 
@@ -307,7 +307,7 @@ class StableDiffusionGGML {
 
         auto& tensor_storage_map = model_loader.get_tensor_storage_map();
         for (auto& [name, tensor_storage] : tensor_storage_map) {
-            if (contains(name, "qwen2vl") &&
+            if (contains(name, "llm") &&
                 ends_with(name, "weight") &&
                 (tensor_storage.type == GGML_TYPE_F32 || tensor_storage.type == GGML_TYPE_BF16)) {
                 tensor_storage.expected_type = GGML_TYPE_F16;
@@ -684,7 +684,7 @@ class StableDiffusionGGML {
             ignore_tensors.insert("first_stage_model.encoder");
             ignore_tensors.insert("first_stage_model.conv1");
             ignore_tensors.insert("first_stage_model.quant");
-            ignore_tensors.insert("text_encoders.qwen2vl.visual.");
+            ignore_tensors.insert("text_encoders.llm.visual.");
         }
         if (version == VERSION_SVD) {
             ignore_tensors.insert("conditioner.embedders.3");
@@ -2465,8 +2465,8 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
              "clip_g_path: %s\n"
              "clip_vision_path: %s\n"
              "t5xxl_path: %s\n"
-             "qwen2vl_path: %s\n"
-             "qwen2vl_vision_path: %s\n"
+             "llm_path: %s\n"
+             "llm_vision_path: %s\n"
              "diffusion_model_path: %s\n"
              "high_noise_diffusion_model_path: %s\n"
              "vae_path: %s\n"
@@ -2496,8 +2496,8 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
              SAFE_STR(sd_ctx_params->clip_g_path),
              SAFE_STR(sd_ctx_params->clip_vision_path),
              SAFE_STR(sd_ctx_params->t5xxl_path),
-             SAFE_STR(sd_ctx_params->qwen2vl_path),
-             SAFE_STR(sd_ctx_params->qwen2vl_vision_path),
+             SAFE_STR(sd_ctx_params->llm_path),
+             SAFE_STR(sd_ctx_params->llm_vision_path),
              SAFE_STR(sd_ctx_params->diffusion_model_path),
              SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path),
              SAFE_STR(sd_ctx_params->vae_path),
diff --git a/stable-diffusion.h b/stable-diffusion.h