Skip to content

Commit 7a2a7d0

Browse files
committed
rename qwenvl to llm
1 parent 66e27de commit 7a2a7d0

File tree

10 files changed

+60
-52
lines changed

10 files changed

+60
-52
lines changed

conditioner.hpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
#define __CONDITIONER_HPP__
33

44
#include "clip.hpp"
5-
#include "qwenvl.hpp"
5+
#include "llm.hpp"
66
#include "t5.hpp"
77

88
struct SDCondition {
@@ -1648,12 +1648,12 @@ struct LLMEmbedder : public Conditioner {
16481648
backend,
16491649
offload_params_to_cpu,
16501650
tensor_storage_map,
1651-
"text_encoders.qwen2vl",
1651+
"text_encoders.llm",
16521652
enable_vision);
16531653
}
16541654

16551655
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
1656-
llm->get_param_tensors(tensors, "text_encoders.qwen2vl");
1656+
llm->get_param_tensors(tensors, "text_encoders.llm");
16571657
}
16581658

16591659
void alloc_params_buffer() override {

docs/qwen_image.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
## Examples
1515

1616
```
17-
.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\qwen-image-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --qwen2vl ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf -p '一个穿着"QWEN"标志的T恤的中国美女正拿着黑色的马克笔面相镜头微笑。她身后的玻璃板上手写体写着 “一、Qwen-Image的技术路线: 探索视觉生成基础模型的极限,开创理解与生成一体化的未来。二、Qwen-Image的模型特色:1、复杂文字渲染。支持中英渲染、自动布局; 2、精准图像编辑。支持文字编辑、物体增减、风格变换。三、Qwen-Image的未来愿景:赋能专业内容创作、助力生成式AI发展。”' --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu -H 1024 -W 1024 --diffusion-fa --flow-shift 3
17+
.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\qwen-image-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf -p '一个穿着"QWEN"标志的T恤的中国美女正拿着黑色的马克笔面相镜头微笑。她身后的玻璃板上手写体写着 “一、Qwen-Image的技术路线: 探索视觉生成基础模型的极限,开创理解与生成一体化的未来。二、Qwen-Image的模型特色:1、复杂文字渲染。支持中英渲染、自动布局; 2、精准图像编辑。支持文字编辑、物体增减、风格变换。三、Qwen-Image的未来愿景:赋能专业内容创作、助力生成式AI发展。”' --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu -H 1024 -W 1024 --diffusion-fa --flow-shift 3
1818
```
1919

2020
<img alt="qwen example" src="../assets/qwen/example.png" />

docs/qwen_image_edit.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
### Qwen Image Edit
2121

2222
```
23-
.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen_Image_Edit-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --qwen2vl ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'" --seed 1118877715456453
23+
.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen_Image_Edit-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'" --seed 1118877715456453
2424
```
2525

2626
<img alt="qwen_image_edit" src="../assets/qwen/qwen_image_edit.png" />
@@ -29,7 +29,7 @@
2929
### Qwen Image Edit 2509
3030

3131
```
32-
.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen-Image-Edit-2509-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --qwen2vl ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf --qwen2vl_vision ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct.mmproj-Q8_0.gguf --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'Qwen Image Edit 2509'"
32+
.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen-Image-Edit-2509-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf --llm_vision ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct.mmproj-Q8_0.gguf --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'Qwen Image Edit 2509'"
3333
```
3434

3535
<img alt="qwen_image_edit_2509" src="../assets/qwen/qwen_image_edit_2509.png" />

examples/cli/README.md

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,10 @@ Options:
99
--clip_g <string> path to the clip-g text encoder
1010
--clip_vision <string> path to the clip-vision encoder
1111
--t5xxl <string> path to the t5xxl text encoder
12-
--qwen2vl <string> path to the qwen2vl text encoder
13-
--qwen2vl_vision <string> path to the qwen2vl vit
12+
--llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)
13+
--llm_vision <string> path to the llm vit
14+
--qwen2vl <string> alias of --llm. Deprecated.
15+
--qwen2vl_vision <string> alias of --llm_vision. Deprecated.
1416
--diffusion-model <string> path to the standalone diffusion model
1517
--high-noise-diffusion-model <string> path to the standalone high noise diffusion model
1618
--vae <string> path to standalone vae model
@@ -33,7 +35,6 @@ Options:
3335
-p, --prompt <string> the prompt to render
3436
-n, --negative-prompt <string> the negative prompt (default: "")
3537
--preview-path <string> path to write preview image to (default: ./preview.png)
36-
--easycache <string> enable EasyCache for DiT models, accepts optional "threshold,start_percent,end_percent" values (defaults to 0.2,0.15,0.95)
3738
--upscale-model <string> path to esrgan model.
3839
-t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
3940
CPU physical cores
@@ -105,20 +106,19 @@ Options:
105106
contain any quantized parameters, the at_runtime mode will be used; otherwise,
106107
immediately will be used. The immediately mode may have precision and
107108
compatibility issues with quantized parameters, but it usually offers faster inference
108-
speed and, in some cases, lower memory usage. The at_runtime mode, on the other
109-
hand, is exactly the opposite.
109+
speed and, in some cases, lower memory usage. The at_runtime mode, on the
110+
other hand, is exactly the opposite.
110111
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, lcm],
111112
default: discrete
112113
--skip-layers layers to skip for SLG steps (default: [7,8,9])
113114
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
114115
ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise
115-
--high-noise-scheduler (high noise) denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform,
116-
simple], default: discrete
117116
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
118117
-r, --ref-image reference image for Flux Kontext models (can be used multiple times)
119118
-h, --help show this help message and exit
120119
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
121120
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
122121
(overrides --vae-tile-size)
123122
--preview preview method. must be one of the following [none, proj, tae, vae] (default is none)
123+
--easycache enable EasyCache for DiT models with optional "threshold,start_percent,end_percent" (default: 0.2,0.15,0.95)
124124
```

examples/cli/main.cpp

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,8 @@ struct SDParams {
7070
std::string clip_g_path;
7171
std::string clip_vision_path;
7272
std::string t5xxl_path;
73-
std::string qwen2vl_path;
74-
std::string qwen2vl_vision_path;
73+
std::string llm_path;
74+
std::string llm_vision_path;
7575
std::string diffusion_model_path;
7676
std::string high_noise_diffusion_model_path;
7777
std::string vae_path;
@@ -174,8 +174,8 @@ void print_params(SDParams params) {
174174
printf(" clip_g_path: %s\n", params.clip_g_path.c_str());
175175
printf(" clip_vision_path: %s\n", params.clip_vision_path.c_str());
176176
printf(" t5xxl_path: %s\n", params.t5xxl_path.c_str());
177-
printf(" qwen2vl_path: %s\n", params.qwen2vl_path.c_str());
178-
printf(" qwen2vl_vision_path: %s\n", params.qwen2vl_vision_path.c_str());
177+
printf(" llm_path: %s\n", params.llm_path.c_str());
178+
printf(" llm_vision_path: %s\n", params.llm_vision_path.c_str());
179179
printf(" diffusion_model_path: %s\n", params.diffusion_model_path.c_str());
180180
printf(" high_noise_diffusion_model_path: %s\n", params.high_noise_diffusion_model_path.c_str());
181181
printf(" vae_path: %s\n", params.vae_path.c_str());
@@ -532,14 +532,22 @@ void parse_args(int argc, const char** argv, SDParams& params) {
532532
"--t5xxl",
533533
"path to the t5xxl text encoder",
534534
&params.t5xxl_path},
535+
{"",
536+
"--llm",
537+
"path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)",
538+
&params.llm_path},
539+
{"",
540+
"--llm_vision",
541+
"path to the llm vit",
542+
&params.llm_vision_path},
535543
{"",
536544
"--qwen2vl",
537-
"path to the qwen2vl text encoder",
538-
&params.qwen2vl_path},
545+
"alias of --llm. Deprecated.",
546+
&params.llm_path},
539547
{"",
540548
"--qwen2vl_vision",
541-
"path to the qwen2vl vit",
542-
&params.qwen2vl_vision_path},
549+
"alias of --llm_vision. Deprecated.",
550+
&params.llm_vision_path},
543551
{"",
544552
"--diffusion-model",
545553
"path to the standalone diffusion model",
@@ -1230,7 +1238,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
12301238
on_relative_tile_size_arg},
12311239
{"",
12321240
"--preview",
1233-
std::string("preview method. must be one of the following [") + previews_str[0] + ", " + previews_str[1] + ", " + previews_str[2] + ", " + previews_str[3] + "] (default is " + previews_str[PREVIEW_NONE] + ")\n",
1241+
std::string("preview method. must be one of the following [") + previews_str[0] + ", " + previews_str[1] + ", " + previews_str[2] + ", " + previews_str[3] + "] (default is " + previews_str[PREVIEW_NONE] + ")",
12341242
on_preview_arg},
12351243
{"",
12361244
"--easycache",
@@ -1428,7 +1436,7 @@ std::string get_image_params(SDParams params, int64_t seed) {
14281436
parameter_string += " " + std::string(sd_scheduler_name(params.sample_params.scheduler));
14291437
}
14301438
parameter_string += ", ";
1431-
for (const auto& te : {params.clip_l_path, params.clip_g_path, params.t5xxl_path, params.qwen2vl_path, params.qwen2vl_vision_path}) {
1439+
for (const auto& te : {params.clip_l_path, params.clip_g_path, params.t5xxl_path, params.llm_path, params.llm_vision_path}) {
14321440
if (!te.empty()) {
14331441
parameter_string += "TE: " + sd_basename(te) + ", ";
14341442
}
@@ -1845,8 +1853,8 @@ int main(int argc, const char* argv[]) {
18451853
params.clip_g_path.c_str(),
18461854
params.clip_vision_path.c_str(),
18471855
params.t5xxl_path.c_str(),
1848-
params.qwen2vl_path.c_str(),
1849-
params.qwen2vl_vision_path.c_str(),
1856+
params.llm_path.c_str(),
1857+
params.llm_vision_path.c_str(),
18501858
params.diffusion_model_path.c_str(),
18511859
params.high_noise_diffusion_model_path.c_str(),
18521860
params.vae_path.c_str(),

qwenvl.hpp renamed to llm.hpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1549,7 +1549,7 @@ namespace LLM {
15491549
ggml_type model_data_type = GGML_TYPE_COUNT;
15501550

15511551
ModelLoader model_loader;
1552-
if (!model_loader.init_from_file_and_convert_name(file_path, "text_encoders.qwen2vl.")) {
1552+
if (!model_loader.init_from_file_and_convert_name(file_path, "text_encoders.llm.")) {
15531553
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
15541554
return;
15551555
}
@@ -1569,12 +1569,12 @@ namespace LLM {
15691569
backend,
15701570
true,
15711571
tensor_storage_map,
1572-
"text_encoders.qwen2vl",
1572+
"text_encoders.llm",
15731573
true);
15741574

15751575
llm->alloc_params_buffer();
15761576
std::map<std::string, ggml_tensor*> tensors;
1577-
llm->get_param_tensors(tensors, "text_encoders.qwen2vl");
1577+
llm->get_param_tensors(tensors, "text_encoders.llm");
15781578

15791579
bool success = model_loader.load_tensors(tensors);
15801580

model.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,8 +105,8 @@ const char* unused_tensors[] = {
105105
"denoiser.sigmas",
106106
"edm_vpred.sigma_max",
107107
"text_encoders.t5xxl.transformer.encoder.embed_tokens.weight", // only used during training
108-
"text_encoders.qwen2vl.output.weight",
109-
"text_encoders.qwen2vl.lm_head.",
108+
"text_encoders.llm.output.weight",
109+
"text_encoders.llm.lm_head.",
110110
"first_stage_model.bn.",
111111
};
112112

name_conversion.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
127127
{"token_embd.", "shared."},
128128
};
129129

130-
static const std::vector<std::pair<std::string, std::string>> qwenvl_name_map{
130+
static const std::vector<std::pair<std::string, std::string>> llm_name_map{
131131
{"token_embd.", "model.embed_tokens."},
132132
{"blk.", "model.layers."},
133133
{"attn_q.", "self_attn.q_proj."},
@@ -142,7 +142,7 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
142142
{"output_norm.", "model.norm."},
143143
};
144144

145-
static const std::vector<std::pair<std::string, std::string>> qwenvl_vision_name_map{
145+
static const std::vector<std::pair<std::string, std::string>> llm_vision_name_map{
146146
{"mm.", "merger.mlp."},
147147
{"v.post_ln.", "merger.ln_q."},
148148
{"v.patch_embd.weight", "patch_embed.proj.0.weight"},
@@ -161,11 +161,11 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
161161
};
162162
if (contains(name, "t5xxl")) {
163163
replace_with_name_map(name, t5_name_map);
164-
} else if (contains(name, "qwen2vl")) {
165-
if (contains(name, "qwen2vl.visual")) {
166-
replace_with_name_map(name, qwenvl_vision_name_map);
164+
} else if (contains(name, "llm")) {
165+
if (contains(name, "llm.visual")) {
166+
replace_with_name_map(name, llm_vision_name_map);
167167
} else {
168-
replace_with_name_map(name, qwenvl_name_map);
168+
replace_with_name_map(name, llm_name_map);
169169
}
170170
} else {
171171
name = convert_open_clip_to_hf_clip_name(name);

stable-diffusion.cpp

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -276,17 +276,17 @@ class StableDiffusionGGML {
276276
}
277277
}
278278

279-
if (strlen(SAFE_STR(sd_ctx_params->qwen2vl_path)) > 0) {
280-
LOG_INFO("loading qwen2vl from '%s'", sd_ctx_params->qwen2vl_path);
281-
if (!model_loader.init_from_file(sd_ctx_params->qwen2vl_path, "text_encoders.qwen2vl.")) {
282-
LOG_WARN("loading qwen2vl from '%s' failed", sd_ctx_params->qwen2vl_path);
279+
if (strlen(SAFE_STR(sd_ctx_params->llm_path)) > 0) {
280+
LOG_INFO("loading llm from '%s'", sd_ctx_params->llm_path);
281+
if (!model_loader.init_from_file(sd_ctx_params->llm_path, "text_encoders.llm.")) {
282+
LOG_WARN("loading llm from '%s' failed", sd_ctx_params->llm_path);
283283
}
284284
}
285285

286-
if (strlen(SAFE_STR(sd_ctx_params->qwen2vl_vision_path)) > 0) {
287-
LOG_INFO("loading qwen2vl vision from '%s'", sd_ctx_params->qwen2vl_vision_path);
288-
if (!model_loader.init_from_file(sd_ctx_params->qwen2vl_vision_path, "text_encoders.qwen2vl.visual.")) {
289-
LOG_WARN("loading qwen2vl vision from '%s' failed", sd_ctx_params->qwen2vl_vision_path);
286+
if (strlen(SAFE_STR(sd_ctx_params->llm_vision_path)) > 0) {
287+
LOG_INFO("loading llm vision from '%s'", sd_ctx_params->llm_vision_path);
288+
if (!model_loader.init_from_file(sd_ctx_params->llm_vision_path, "text_encoders.llm.visual.")) {
289+
LOG_WARN("loading llm vision from '%s' failed", sd_ctx_params->llm_vision_path);
290290
}
291291
}
292292

@@ -307,7 +307,7 @@ class StableDiffusionGGML {
307307

308308
auto& tensor_storage_map = model_loader.get_tensor_storage_map();
309309
for (auto& [name, tensor_storage] : tensor_storage_map) {
310-
if (contains(name, "qwen2vl") &&
310+
if (contains(name, "llm") &&
311311
ends_with(name, "weight") &&
312312
(tensor_storage.type == GGML_TYPE_F32 || tensor_storage.type == GGML_TYPE_BF16)) {
313313
tensor_storage.expected_type = GGML_TYPE_F16;
@@ -684,7 +684,7 @@ class StableDiffusionGGML {
684684
ignore_tensors.insert("first_stage_model.encoder");
685685
ignore_tensors.insert("first_stage_model.conv1");
686686
ignore_tensors.insert("first_stage_model.quant");
687-
ignore_tensors.insert("text_encoders.qwen2vl.visual.");
687+
ignore_tensors.insert("text_encoders.llm.visual.");
688688
}
689689
if (version == VERSION_SVD) {
690690
ignore_tensors.insert("conditioner.embedders.3");
@@ -2465,8 +2465,8 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
24652465
"clip_g_path: %s\n"
24662466
"clip_vision_path: %s\n"
24672467
"t5xxl_path: %s\n"
2468-
"qwen2vl_path: %s\n"
2469-
"qwen2vl_vision_path: %s\n"
2468+
"llm_path: %s\n"
2469+
"llm_vision_path: %s\n"
24702470
"diffusion_model_path: %s\n"
24712471
"high_noise_diffusion_model_path: %s\n"
24722472
"vae_path: %s\n"
@@ -2496,8 +2496,8 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
24962496
SAFE_STR(sd_ctx_params->clip_g_path),
24972497
SAFE_STR(sd_ctx_params->clip_vision_path),
24982498
SAFE_STR(sd_ctx_params->t5xxl_path),
2499-
SAFE_STR(sd_ctx_params->qwen2vl_path),
2500-
SAFE_STR(sd_ctx_params->qwen2vl_vision_path),
2499+
SAFE_STR(sd_ctx_params->llm_path),
2500+
SAFE_STR(sd_ctx_params->llm_vision_path),
25012501
SAFE_STR(sd_ctx_params->diffusion_model_path),
25022502
SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path),
25032503
SAFE_STR(sd_ctx_params->vae_path),

0 commit comments

Comments
 (0)