22#define __CONDITIONER_HPP__
33
44#include " clip.hpp"
5+ #include " ggml-alloc.h"
6+ #include " ggml-backend.h"
57#include " llm.hpp"
68#include " t5.hpp"
9+ #include " util.h"
710
811struct SDCondition {
912 struct ggml_tensor * c_crossattn = nullptr ; // aka context
@@ -62,7 +65,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
6265 std::vector<uint8_t > token_embed_custom;
6366 std::map<std::string, std::pair<int , int >> embedding_pos_map;
6467
65- FrozenCLIPEmbedderWithCustomWords (ggml_backend_t backend ,
68+ FrozenCLIPEmbedderWithCustomWords (std::vector< ggml_backend_t > backends ,
6669 bool offload_params_to_cpu,
6770 const String2TensorStorage& tensor_storage_map,
6871 const std::map<std::string, std::string>& orig_embedding_map,
@@ -76,13 +79,27 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
7679 tokenizer.add_special_token (name);
7780 }
7881 bool force_clip_f32 = !embedding_map.empty ();
82+
83+ ggml_backend_t clip_backend = backends[0 ];
84+
7985 if (sd_version_is_sd1 (version)) {
80- text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, " cond_stage_model.transformer.text_model" , OPENAI_CLIP_VIT_L_14, true , force_clip_f32);
86+ LOG_INFO (" CLIP-L: using %s backend" , ggml_backend_name (clip_backend));
87+ text_model = std::make_shared<CLIPTextModelRunner>(clip_backend, offload_params_to_cpu, tensor_storage_map, " cond_stage_model.transformer.text_model" , OPENAI_CLIP_VIT_L_14, true , force_clip_f32);
8188 } else if (sd_version_is_sd2 (version)) {
82- text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, " cond_stage_model.transformer.text_model" , OPEN_CLIP_VIT_H_14, true , force_clip_f32);
89+ LOG_INFO (" CLIP-H: using %s backend" , ggml_backend_name (clip_backend));
90+ text_model = std::make_shared<CLIPTextModelRunner>(clip_backend, offload_params_to_cpu, tensor_storage_map, " cond_stage_model.transformer.text_model" , OPEN_CLIP_VIT_H_14, true , force_clip_f32);
8391 } else if (sd_version_is_sdxl (version)) {
84- text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, " cond_stage_model.transformer.text_model" , OPENAI_CLIP_VIT_L_14, false , force_clip_f32);
85- text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, " cond_stage_model.1.transformer.text_model" , OPEN_CLIP_VIT_BIGG_14, false , force_clip_f32);
92+ ggml_backend_t clip_g_backend = clip_backend;
93+ if (backends.size () >= 2 ){
94+ clip_g_backend = backends[1 ];
95+ if (backends.size () > 2 ) {
96+ LOG_WARN (" More than 2 clip backends provided, but the model only supports 2 text encoders. Ignoring the rest." );
97+ }
98+ }
99+ LOG_INFO (" CLIP-L: using %s backend" , ggml_backend_name (clip_backend));
100+ LOG_INFO (" CLIP-G: using %s backend" , ggml_backend_name (clip_g_backend));
101+ text_model = std::make_shared<CLIPTextModelRunner>(clip_backend, offload_params_to_cpu, tensor_storage_map, " cond_stage_model.transformer.text_model" , OPENAI_CLIP_VIT_L_14, false , force_clip_f32);
102+ text_model2 = std::make_shared<CLIPTextModelRunner>(clip_g_backend, offload_params_to_cpu, tensor_storage_map, " cond_stage_model.1.transformer.text_model" , OPEN_CLIP_VIT_BIGG_14, false , force_clip_f32);
86103 }
87104 }
88105
@@ -702,13 +719,29 @@ struct SD3CLIPEmbedder : public Conditioner {
702719 std::shared_ptr<CLIPTextModelRunner> clip_g;
703720 std::shared_ptr<T5Runner> t5;
704721
705- SD3CLIPEmbedder (ggml_backend_t backend ,
722+ SD3CLIPEmbedder (std::vector< ggml_backend_t > backends ,
706723 bool offload_params_to_cpu,
707724 const String2TensorStorage& tensor_storage_map = {})
708725 : clip_g_tokenizer(0 ) {
709726 bool use_clip_l = false ;
710727 bool use_clip_g = false ;
711728 bool use_t5 = false ;
729+
730+ ggml_backend_t clip_l_backend = nullptr , clip_g_backend = nullptr , t5_backend = nullptr ; // stay null (not indeterminate) if `backends` is empty
731+ if (backends.size () == 1 ) {
732+ clip_l_backend = clip_g_backend = t5_backend = backends[0 ];
733+ } else if (backends.size () == 2 ) {
734+ clip_l_backend = clip_g_backend = backends[0 ];
735+ t5_backend = backends[1 ];
736+ } else if (backends.size () >= 3 ) {
737+ clip_l_backend = backends[0 ];
738+ clip_g_backend = backends[1 ];
739+ t5_backend = backends[2 ];
740+ if (backends.size () > 3 ) {
741+ LOG_WARN (" More than 3 clip backends provided, but the model only supports 3 text encoders. Ignoring the rest." );
742+ }
743+ }
744+
712745 for (auto pair : tensor_storage_map) {
713746 if (pair.first .find (" text_encoders.clip_l" ) != std::string::npos) {
714747 use_clip_l = true ;
@@ -723,13 +756,16 @@ struct SD3CLIPEmbedder : public Conditioner {
723756 return ;
724757 }
725758 if (use_clip_l) {
726- clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, " text_encoders.clip_l.transformer.text_model" , OPENAI_CLIP_VIT_L_14, false );
759+ LOG_INFO (" CLIP-L: using %s backend" , ggml_backend_name (clip_l_backend));
760+ clip_l = std::make_shared<CLIPTextModelRunner>(clip_l_backend, offload_params_to_cpu, tensor_storage_map, " text_encoders.clip_l.transformer.text_model" , OPENAI_CLIP_VIT_L_14, false );
727761 }
728762 if (use_clip_g) {
729- clip_g = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, " text_encoders.clip_g.transformer.text_model" , OPEN_CLIP_VIT_BIGG_14, false );
763+ LOG_INFO (" CLIP-G: using %s backend" , ggml_backend_name (clip_g_backend));
764+ clip_g = std::make_shared<CLIPTextModelRunner>(clip_g_backend, offload_params_to_cpu, tensor_storage_map, " text_encoders.clip_g.transformer.text_model" , OPEN_CLIP_VIT_BIGG_14, false );
730765 }
731766 if (use_t5) {
732- t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_storage_map, " text_encoders.t5xxl.transformer" );
767+ LOG_INFO (" T5-XXL: using %s backend" , ggml_backend_name (t5_backend));
768+ t5 = std::make_shared<T5Runner>(t5_backend, offload_params_to_cpu, tensor_storage_map, " text_encoders.t5xxl.transformer" );
733769 }
734770 }
735771
@@ -1123,11 +1159,25 @@ struct FluxCLIPEmbedder : public Conditioner {
11231159 std::shared_ptr<T5Runner> t5;
11241160 size_t chunk_len = 256 ;
11251161
1126- FluxCLIPEmbedder (ggml_backend_t backend ,
1162+ FluxCLIPEmbedder (std::vector< ggml_backend_t > backends ,
11271163 bool offload_params_to_cpu,
11281164 const String2TensorStorage& tensor_storage_map = {}) {
11291165 bool use_clip_l = false ;
11301166 bool use_t5 = false ;
1167+
1168+
1169+ ggml_backend_t clip_l_backend = nullptr , t5_backend = nullptr ; // stay null (not indeterminate) if `backends` is empty
1170+ if (backends.size () == 1 ) {
1171+ clip_l_backend = t5_backend = backends[0 ];
1172+ } else if (backends.size () >= 2 ) {
1173+ clip_l_backend = backends[0 ];
1174+ t5_backend = backends[1 ];
1175+ if (backends.size () > 2 ) {
1176+ LOG_WARN (" More than 2 clip backends provided, but the model only supports 2 text encoders. Ignoring the rest." );
1177+ }
1178+ }
1179+
1180+
11311181 for (auto pair : tensor_storage_map) {
11321182 if (pair.first .find (" text_encoders.clip_l" ) != std::string::npos) {
11331183 use_clip_l = true ;
@@ -1142,12 +1192,14 @@ struct FluxCLIPEmbedder : public Conditioner {
11421192 }
11431193
11441194 if (use_clip_l) {
1145- clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, " text_encoders.clip_l.transformer.text_model" , OPENAI_CLIP_VIT_L_14, true );
1195+ LOG_INFO (" CLIP-L: using %s backend" , ggml_backend_name (clip_l_backend));
1196+ clip_l = std::make_shared<CLIPTextModelRunner>(clip_l_backend, offload_params_to_cpu, tensor_storage_map, " text_encoders.clip_l.transformer.text_model" , OPENAI_CLIP_VIT_L_14, true );
11461197 } else {
11471198 LOG_WARN (" clip_l text encoder not found! Prompt adherence might be degraded." );
11481199 }
11491200 if (use_t5) {
1150- t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_storage_map, " text_encoders.t5xxl.transformer" );
1201+ LOG_INFO (" T5-XXL: using %s backend" , ggml_backend_name (t5_backend));
1202+ t5 = std::make_shared<T5Runner>(t5_backend, offload_params_to_cpu, tensor_storage_map, " text_encoders.t5xxl.transformer" );
11511203 } else {
11521204 LOG_WARN (" t5xxl text encoder not found! Prompt adherence might be degraded." );
11531205 }
0 commit comments