
Commit 513d9bf
Author: Stéphane du Hamel
Parent: 13bc938

    multiple clip backend devices

File tree: 3 files changed, +137 -40 lines


conditioner.hpp

Lines changed: 64 additions & 12 deletions
@@ -2,8 +2,11 @@
 #define __CONDITIONER_HPP__
 
 #include "clip.hpp"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
 #include "llm.hpp"
 #include "t5.hpp"
+#include "util.h"
 
 struct SDCondition {
     struct ggml_tensor* c_crossattn = nullptr;  // aka context
@@ -62,7 +65,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
     std::vector<uint8_t> token_embed_custom;
     std::map<std::string, std::pair<int, int>> embedding_pos_map;
 
-    FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
+    FrozenCLIPEmbedderWithCustomWords(std::vector<ggml_backend_t> backends,
                                       bool offload_params_to_cpu,
                                       const String2TensorStorage& tensor_storage_map,
                                       const std::map<std::string, std::string>& orig_embedding_map,
@@ -76,13 +79,27 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             tokenizer.add_special_token(name);
         }
         bool force_clip_f32 = !embedding_map.empty();
+
+        ggml_backend_t clip_backend = backends[0];
+
         if (sd_version_is_sd1(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
+            LOG_INFO("CLIP-L: using %s backend", ggml_backend_name(clip_backend));
+            text_model = std::make_shared<CLIPTextModelRunner>(clip_backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
         } else if (sd_version_is_sd2(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
+            LOG_INFO("CLIP-H: using %s backend", ggml_backend_name(clip_backend));
+            text_model = std::make_shared<CLIPTextModelRunner>(clip_backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
         } else if (sd_version_is_sdxl(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
-            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
+            ggml_backend_t clip_g_backend = clip_backend;
+            if (backends.size() >= 2) {
+                clip_g_backend = backends[1];
+                if (backends.size() > 2) {
+                    LOG_WARN("More than 2 clip backends provided, but the model only supports 2 text encoders. Ignoring the rest.");
+                }
+            }
+            LOG_INFO("CLIP-L: using %s backend", ggml_backend_name(clip_backend));
+            LOG_INFO("CLIP-G: using %s backend", ggml_backend_name(clip_g_backend));
+            text_model = std::make_shared<CLIPTextModelRunner>(clip_backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
+            text_model2 = std::make_shared<CLIPTextModelRunner>(clip_g_backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
         }
     }
 
@@ -702,13 +719,29 @@ struct SD3CLIPEmbedder : public Conditioner {
     std::shared_ptr<CLIPTextModelRunner> clip_g;
     std::shared_ptr<T5Runner> t5;
 
-    SD3CLIPEmbedder(ggml_backend_t backend,
+    SD3CLIPEmbedder(std::vector<ggml_backend_t> backends,
                     bool offload_params_to_cpu,
                     const String2TensorStorage& tensor_storage_map = {})
         : clip_g_tokenizer(0) {
         bool use_clip_l = false;
         bool use_clip_g = false;
         bool use_t5 = false;
+
+        ggml_backend_t clip_l_backend, clip_g_backend, t5_backend;
+        if (backends.size() == 1) {
+            clip_l_backend = clip_g_backend = t5_backend = backends[0];
+        } else if (backends.size() == 2) {
+            clip_l_backend = clip_g_backend = backends[0];
+            t5_backend = backends[1];
+        } else if (backends.size() >= 3) {
+            clip_l_backend = backends[0];
+            clip_g_backend = backends[1];
+            t5_backend = backends[2];
+            if (backends.size() > 3) {
+                LOG_WARN("More than 3 clip backends provided, but the model only supports 3 text encoders. Ignoring the rest.");
+            }
+        }
+
         for (auto pair : tensor_storage_map) {
             if (pair.first.find("text_encoders.clip_l") != std::string::npos) {
                 use_clip_l = true;
@@ -723,13 +756,16 @@ struct SD3CLIPEmbedder : public Conditioner {
             return;
         }
         if (use_clip_l) {
-            clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
+            LOG_INFO("CLIP-L: using %s backend", ggml_backend_name(clip_l_backend));
+            clip_l = std::make_shared<CLIPTextModelRunner>(clip_l_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
         }
         if (use_clip_g) {
-            clip_g = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
+            LOG_INFO("CLIP-G: using %s backend", ggml_backend_name(clip_g_backend));
+            clip_g = std::make_shared<CLIPTextModelRunner>(clip_g_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
         }
         if (use_t5) {
-            t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
+            LOG_INFO("T5-XXL: using %s backend", ggml_backend_name(t5_backend));
+            t5 = std::make_shared<T5Runner>(t5_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
         }
     }
 
@@ -1123,11 +1159,25 @@ struct FluxCLIPEmbedder : public Conditioner {
     std::shared_ptr<T5Runner> t5;
     size_t chunk_len = 256;
 
-    FluxCLIPEmbedder(ggml_backend_t backend,
+    FluxCLIPEmbedder(std::vector<ggml_backend_t> backends,
                      bool offload_params_to_cpu,
                      const String2TensorStorage& tensor_storage_map = {}) {
         bool use_clip_l = false;
         bool use_t5 = false;
+
+
+        ggml_backend_t clip_l_backend, t5_backend;
+        if (backends.size() == 1) {
+            clip_l_backend = t5_backend = backends[0];
+        } else if (backends.size() >= 2) {
+            clip_l_backend = backends[0];
+            t5_backend = backends[1];
+            if (backends.size() > 2) {
+                LOG_WARN("More than 2 clip backends provided, but the model only supports 2 text encoders. Ignoring the rest.");
+            }
+        }
+
+
         for (auto pair : tensor_storage_map) {
             if (pair.first.find("text_encoders.clip_l") != std::string::npos) {
                 use_clip_l = true;
@@ -1142,12 +1192,14 @@ struct FluxCLIPEmbedder : public Conditioner {
         }
 
         if (use_clip_l) {
-            clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
+            LOG_INFO("CLIP-L: using %s backend", ggml_backend_name(clip_l_backend));
+            clip_l = std::make_shared<CLIPTextModelRunner>(clip_l_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
         } else {
            LOG_WARN("clip_l text encoder not found! Prompt adherence might be degraded.");
         }
         if (use_t5) {
-            t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
+            LOG_INFO("T5-XXL: using %s backend", ggml_backend_name(t5_backend));
+            t5 = std::make_shared<T5Runner>(t5_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
         } else {
             LOG_WARN("t5xxl text encoder not found! Prompt adherence might be degraded.");
         }
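
Note: the constructors above all apply the same selection rule — one device serves every text encoder, extra devices are assigned in declaration order, and surplus entries are ignored with a warning. A minimal standalone sketch of that rule (illustrative only: backend_t and pick_backends are hypothetical stand-ins, not part of this commit, and like the constructors it assumes backends is non-empty):

#include <cstdio>
#include <vector>

using backend_t = int;  // hypothetical stand-in for ggml_backend_t

// Mirrors the SD3CLIPEmbedder rule: 1 backend -> shared by all encoders;
// 2 backends -> both CLIP models on backends[0], T5 on backends[1];
// 3+ backends -> one per encoder, extras ignored with a warning.
static void pick_backends(const std::vector<backend_t>& backends,
                          backend_t& clip_l, backend_t& clip_g, backend_t& t5) {
    clip_l = clip_g = t5 = backends[0];  // assumes at least one backend
    if (backends.size() == 2) {
        t5 = backends[1];
    } else if (backends.size() >= 3) {
        clip_g = backends[1];
        t5 = backends[2];
        if (backends.size() > 3) {
            std::fprintf(stderr, "ignoring %zu extra backend(s)\n", backends.size() - 3);
        }
    }
}

int main() {
    backend_t clip_l, clip_g, t5;
    pick_backends({10, 20}, clip_l, clip_g, t5);
    std::printf("clip_l=%d clip_g=%d t5=%d\n", clip_l, clip_g, t5);  // 10 10 20
    return 0;
}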

examples/common/common.hpp

Lines changed: 1 addition & 1 deletion
@@ -614,7 +614,7 @@ struct SDContextParams {
          &photomaker_backend_device},
         {"",
          "--vision-backend-device",
-         "device to use for clip-vision model (defaults to clip-backend-device)",
+         "device to use for clip-vision model (defaults to main-backend-device)",
          &vision_backend_device},
 
     };
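
Note: this diff does not show how the vector of clip backends is assembled from the CLI options. If the clip device flag accepts a comma-separated list of device names, splitting it might look like the sketch below (pure assumption: split_devices and the comma syntax are hypothetical and not confirmed by this commit; the real parsing in this repo may differ):

#include <sstream>
#include <string>
#include <vector>

// Hypothetical helper: turn a value such as "cuda0,cuda1,cpu" into
// individual device names, which a caller could then resolve to
// ggml_backend_t handles in order.
static std::vector<std::string> split_devices(const std::string& arg) {
    std::vector<std::string> devices;
    std::stringstream ss(arg);
    std::string item;
    while (std::getline(ss, item, ',')) {
        if (!item.empty()) {
            devices.push_back(item);
        }
    }
    return devices;
}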
