12 changes: 9 additions & 3 deletions common.hpp
@@ -3,6 +3,10 @@

#include "ggml_extend.hpp"

#ifdef SD_USE_VULKAN
#include "ggml-vulkan.h"
#endif

class DownSampleBlock : public GGMLBlock {
protected:
int channels;
@@ -248,9 +252,6 @@ class FeedForward : public GGMLBlock {
float scale = 1.f;
if (precision_fix) {
scale = 1.f / 128.f;
#ifdef SD_USE_VULKAN
force_prec_f32 = true;
#endif
}
// The purpose of the scale here is to prevent NaN issues in certain situations.
// For example, when using Vulkan without enabling force_prec_f32,
@@ -264,6 +265,11 @@

auto net_0 = std::dynamic_pointer_cast<UnaryBlock>(blocks["net.0"]);
auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);
#ifdef SD_USE_VULKAN
if (ggml_backend_is_vk(ctx->backend)) {
net_2->set_force_prec_f32(true);
}
#endif

x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim]
x = net_2->forward(ctx, x); // [ne3, ne2, ne1, dim_out]
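Note on the common.hpp change: the compile-time #ifdef SD_USE_VULKAN override is replaced by a runtime check, so a binary built with Vulkan support no longer forces f32 precision when the graph actually runs on another backend. A minimal sketch of the pattern, assuming only ggml_backend_is_vk() from ggml-vulkan.h (the helper name should_force_f32 is illustrative, not part of the PR):

    #include "ggml-backend.h"
    #ifdef SD_USE_VULKAN
    #include "ggml-vulkan.h"  // declares ggml_backend_is_vk()
    #endif

    // Decide per run, not per build, whether to force f32 precision.
    static bool should_force_f32(ggml_backend_t backend) {
    #ifdef SD_USE_VULKAN
        // Only the Vulkan backend needs the fix; f16 accumulation there
        // can overflow to NaN in FeedForward's second linear layer.
        if (ggml_backend_is_vk(backend)) {
            return true;
        }
    #endif
        return false;
    }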
76 changes: 64 additions & 12 deletions conditioner.hpp
@@ -2,8 +2,11 @@
#define __CONDITIONER_HPP__

#include "clip.hpp"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "llm.hpp"
#include "t5.hpp"
#include "util.h"

struct SDCondition {
struct ggml_tensor* c_crossattn = nullptr; // aka context
@@ -62,7 +65,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
std::vector<uint8_t> token_embed_custom;
std::map<std::string, std::pair<int, int>> embedding_pos_map;

FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
FrozenCLIPEmbedderWithCustomWords(std::vector<ggml_backend_t> backends,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map,
const std::map<std::string, std::string>& orig_embedding_map,
@@ -76,13 +79,27 @@
tokenizer.add_special_token(name);
}
bool force_clip_f32 = !embedding_map.empty();

ggml_backend_t clip_backend = backends[0];

if (sd_version_is_sd1(version)) {
text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
LOG_INFO("CLIP-L: using %s backend", ggml_backend_name(clip_backend));
text_model = std::make_shared<CLIPTextModelRunner>(clip_backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
} else if (sd_version_is_sd2(version)) {
text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
LOG_INFO("CLIP-H: using %s backend", ggml_backend_name(clip_backend));
text_model = std::make_shared<CLIPTextModelRunner>(clip_backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
} else if (sd_version_is_sdxl(version)) {
text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
ggml_backend_t clip_g_backend = clip_backend;
if (backends.size() >= 2) {
clip_g_backend = backends[1];
if (backends.size() > 2) {
LOG_WARN("More than 2 clip backends provided, but the model only supports 2 text encoders. Ignoring the rest.");
}
}
LOG_INFO("CLIP-L: using %s backend", ggml_backend_name(clip_backend));
LOG_INFO("CLIP-G: using %s backend", ggml_backend_name(clip_g_backend));
text_model = std::make_shared<CLIPTextModelRunner>(clip_backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
text_model2 = std::make_shared<CLIPTextModelRunner>(clip_g_backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
}
}

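Note: with the constructor now taking std::vector<ggml_backend_t>, SDXL's two text encoders can sit on different devices: backends[0] always drives CLIP-L, and backends[1], when present, drives CLIP-G. A hypothetical caller-side sketch (gpu_backend is a placeholder for any backend the caller already created; ggml_backend_cpu_init() is ggml's stock CPU backend constructor); the vector is then passed as the constructor's first argument:

    std::vector<ggml_backend_t> backends;
    backends.push_back(gpu_backend);             // backends[0] -> CLIP-L
    backends.push_back(ggml_backend_cpu_init()); // backends[1] -> CLIP-G (SDXL)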
@@ -702,13 +719,29 @@ struct SD3CLIPEmbedder : public Conditioner {
std::shared_ptr<CLIPTextModelRunner> clip_g;
std::shared_ptr<T5Runner> t5;

SD3CLIPEmbedder(ggml_backend_t backend,
SD3CLIPEmbedder(std::vector<ggml_backend_t> backends,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {})
: clip_g_tokenizer(0) {
bool use_clip_l = false;
bool use_clip_g = false;
bool use_t5 = false;

ggml_backend_t clip_l_backend, clip_g_backend, t5_backend;
if (backends.size() == 1) {
clip_l_backend = clip_g_backend = t5_backend = backends[0];
} else if (backends.size() == 2) {
clip_l_backend = clip_g_backend = backends[0];
t5_backend = backends[1];
} else if (backends.size() >= 3) {
clip_l_backend = backends[0];
clip_g_backend = backends[1];
t5_backend = backends[2];
if (backends.size() > 3) {
LOG_WARN("More than 3 clip backends provided, but the model only supports 3 text encoders. Ignoring the rest.");
}
}

for (auto pair : tensor_storage_map) {
if (pair.first.find("text_encoders.clip_l") != std::string::npos) {
use_clip_l = true;
@@ -723,13 +756,16 @@
return;
}
if (use_clip_l) {
clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
LOG_INFO("CLIP-L: using %s backend", ggml_backend_name(clip_l_backend));
clip_l = std::make_shared<CLIPTextModelRunner>(clip_l_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
}
if (use_clip_g) {
clip_g = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
LOG_INFO("CLIP-G: using %s backend", ggml_backend_name(clip_g_backend));
clip_g = std::make_shared<CLIPTextModelRunner>(clip_g_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
}
if (use_t5) {
t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
LOG_INFO("T5-XXL: using %s backend", ggml_backend_name(t5_backend));
t5 = std::make_shared<T5Runner>(t5_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
}
}

@@ -1123,11 +1159,25 @@ struct FluxCLIPEmbedder : public Conditioner {
std::shared_ptr<T5Runner> t5;
size_t chunk_len = 256;

FluxCLIPEmbedder(ggml_backend_t backend,
FluxCLIPEmbedder(std::vector<ggml_backend_t> backends,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {}) {
bool use_clip_l = false;
bool use_t5 = false;

ggml_backend_t clip_l_backend, t5_backend;
if (backends.size() == 1) {
clip_l_backend = t5_backend = backends[0];
} else if (backends.size() >= 2) {
clip_l_backend = backends[0];
t5_backend = backends[1];
if (backends.size() > 2) {
LOG_WARN("More than 2 clip backends provided, but the model only supports 2 text encoders. Ignoring the rest.");
}
}

for (auto pair : tensor_storage_map) {
if (pair.first.find("text_encoders.clip_l") != std::string::npos) {
use_clip_l = true;
@@ -1142,12 +1192,14 @@
}

if (use_clip_l) {
clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
LOG_INFO("CLIP-L: using %s backend", ggml_backend_name(clip_l_backend));
clip_l = std::make_shared<CLIPTextModelRunner>(clip_l_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
} else {
LOG_WARN("clip_l text encoder not found! Prompt adherence might be degraded.");
}
if (use_t5) {
t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
LOG_INFO("T5-XXL: using %s backend", ggml_backend_name(clip_l_backend));
t5 = std::make_shared<T5Runner>(t5_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
} else {
LOG_WARN("t5xxl text encoder not found! Prompt adherence might be degraded.");
}
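Note: FluxCLIPEmbedder follows the same pattern with two slots: backends[0] takes CLIP-L, and backends[1], if given, takes T5-XXL. The practical use is keeping the small CLIP encoder on the GPU while the much larger T5-XXL runs elsewhere. A sketch under the same assumptions as above:

    std::vector<ggml_backend_t> backends = {gpu_backend, ggml_backend_cpu_init()};
    FluxCLIPEmbedder embedder(backends, /*offload_params_to_cpu=*/false, tensor_storage_map);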