
Commit 6600b89

Merge branch 'master' into rescue_flash_attn

2 parents: cbf0489 + ea9b647

18 files changed: +1399 -200 lines

README.md

Lines changed: 3 additions & 0 deletions

```diff
@@ -305,12 +305,15 @@ These projects wrap `stable-diffusion.cpp` for easier use in other languages/frameworks:
 
 * Golang: [seasonjs/stable-diffusion](https://github.com/seasonjs/stable-diffusion)
 * C#: [DarthAffe/StableDiffusion.NET](https://github.com/DarthAffe/StableDiffusion.NET)
+* Python: [william-murray1204/stable-diffusion-cpp-python](https://github.com/william-murray1204/stable-diffusion-cpp-python)
+* Rust: [newfla/diffusion-rs](https://github.com/newfla/diffusion-rs)
 
 ## UIs
 
 These projects use `stable-diffusion.cpp` as a backend for their image generation.
 
 - [Jellybox](https://jellybox.com)
+- [Stable Diffusion GUI](https://github.com/fszontagh/sd.cpp.gui.wx)
 
 ## Contributors
```
clip.hpp

Lines changed: 28 additions & 7 deletions
```diff
@@ -343,6 +343,14 @@ class CLIPTokenizer {
         }
     }
 
+    std::string clean_up_tokenization(std::string& text) {
+
+        std::regex pattern(R"( ,)");
+        // Replace " ," with ","
+        std::string result = std::regex_replace(text, pattern, ",");
+        return result;
+    }
+
     std::string decode(const std::vector<int>& tokens) {
         std::string text = "";
         for (int t : tokens) {
```
```diff
@@ -351,8 +359,12 @@ class CLIPTokenizer {
             std::u32string ts = decoder[t];
             // printf("%d, %s \n", t, utf32_to_utf8(ts).c_str());
             std::string s = utf32_to_utf8(ts);
-            if (s.length() >= 4 && ends_with(s, "</w>")) {
-                text += " " + s.replace(s.length() - 4, s.length() - 1, "");
+            if (s.length() >= 4) {
+                if (ends_with(s, "</w>")) {
+                    text += s.replace(s.length() - 4, s.length() - 1, "") + " ";
+                } else {
+                    text += s;
+                }
             } else {
                 text += " " + s;
             }
```
```diff
@@ -364,6 +376,7 @@ class CLIPTokenizer {
 
         // std::string s((char *)bytes.data());
         // std::string s = "";
+        text = clean_up_tokenization(text);
         return trim(text);
     }
 
```
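The `decode()` change above moves the word-boundary space from before a `</w>`-terminated token to after it, which can leave a stray space in front of punctuation tokens; `clean_up_tokenization` then collapses `" ,"` back to `","`. A standalone sketch of that cleanup, assuming the single `" ,"` pattern shown in the diff is the only artifact handled:

```cpp
#include <cassert>
#include <regex>
#include <string>

// Collapse the space that decode() now emits before a "," token.
static std::string clean_up_tokenization(const std::string& text) {
    return std::regex_replace(text, std::regex(R"( ,)"), ",");
}

int main() {
    // "a" "girl</w>" "," decodes to "a girl ," -> cleaned to "a girl,"
    assert(clean_up_tokenization("a girl , retro futurism") == "a girl, retro futurism");
    return 0;
}
```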

```diff
@@ -711,8 +724,12 @@ class CLIPTextModel : public GGMLBlock {
         if (return_pooled) {
             auto text_projection = params["text_projection"];
             ggml_tensor* pooled = ggml_view_1d(ctx, x, hidden_size, x->nb[1] * max_token_idx);
-            pooled = ggml_mul_mat(ctx, ggml_cont(ctx, ggml_transpose(ctx, text_projection)), pooled);
-            return pooled;
+            if (text_projection != NULL) {
+                pooled = ggml_nn_linear(ctx, pooled, text_projection, NULL);
+            } else {
+                LOG_DEBUG("Missing text_projection matrix, assuming identity...");
+            }
+            return pooled;  // [hidden_size, 1, 1]
         }
 
         return x;  // [N, n_token, hidden_size]
```
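`CLIPTextModel::forward` now applies `text_projection` only when the checkpoint provides one, and otherwise returns the raw pooled hidden state, treating the missing matrix as an identity. A shape-level sketch of that fallback with plain `std::vector` stand-ins (hypothetical types, not the ggml graph API):

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

using Vec = std::vector<float>;
using Mat = std::vector<Vec>;  // [out_dim][in_dim]

// Project the pooled embedding when a text_projection matrix exists;
// otherwise pass it through unchanged (identity), as in the diff above.
static Vec project_pooled(const Vec& pooled, const Mat* text_projection) {
    if (text_projection == nullptr) {
        std::fprintf(stderr, "Missing text_projection matrix, assuming identity...\n");
        return pooled;
    }
    Vec out(text_projection->size(), 0.0f);
    for (std::size_t r = 0; r < text_projection->size(); r++)
        for (std::size_t c = 0; c < pooled.size(); c++)
            out[r] += (*text_projection)[r][c] * pooled[c];
    return out;
}
```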
```diff
@@ -751,7 +768,8 @@ class CLIPVisionModel : public GGMLBlock {
         blocks["post_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
     }
 
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values, bool return_pooled = true) {
+    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values,
+                                bool return_pooled = true) {
         // pixel_values: [N, num_channels, image_size, image_size]
         auto embeddings = std::dynamic_pointer_cast<CLIPVisionEmbeddings>(blocks["embeddings"]);
         auto pre_layernorm = std::dynamic_pointer_cast<LayerNorm>(blocks["pre_layernorm"]);
```
```diff
@@ -761,14 +779,17 @@ class CLIPVisionModel : public GGMLBlock {
         auto x = embeddings->forward(ctx, pixel_values);  // [N, num_positions, embed_dim]
         x = pre_layernorm->forward(ctx, x);
         x = encoder->forward(ctx, x, -1, false);
+        // print_ggml_tensor(x, true, "ClipVisionModel x: ");
+        auto last_hidden_state = x;
         x = post_layernorm->forward(ctx, x);  // [N, n_token, hidden_size]
 
-        GGML_ASSERT(x->ne[3] == 1);
+        GGML_ASSERT(x->ne[3] == 1);
         if (return_pooled) {
             ggml_tensor* pooled = ggml_cont(ctx, ggml_view_2d(ctx, x, x->ne[0], x->ne[2], x->nb[2], 0));
             return pooled;  // [N, hidden_size]
         } else {
-            return x;  // [N, n_token, hidden_size]
+            // return x; // [N, n_token, hidden_size]
+            return last_hidden_state;  // [N, n_token, hidden_size]
         }
     }
 };
```
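`CLIPVisionModel::forward` now snapshots the encoder output before `post_layernorm`; the non-pooled path returns that un-normalized `last_hidden_state`, while the pooled path still takes the first (class) token of the normalized output. A toy illustration of the new control flow, assuming `std::vector` stand-ins rather than ggml tensors:

```cpp
#include <vector>

using Token  = std::vector<float>;
using Hidden = std::vector<Token>;  // [n_token, hidden_size]

// Stand-in for the final layernorm; identity here, just to keep the sketch runnable.
static Hidden post_layernorm(const Hidden& x) { return x; }

static Hidden vision_forward(const Hidden& encoder_out, bool return_pooled) {
    Hidden last_hidden_state = encoder_out;  // saved *before* the final layernorm
    Hidden x = post_layernorm(encoder_out);
    if (return_pooled)
        return Hidden{x[0]};      // class-token embedding -> [1, hidden_size]
    return last_hidden_state;     // un-normalized -> [n_token, hidden_size]
}
```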

conditioner.hpp

Lines changed: 46 additions & 54 deletions
```diff
@@ -4,6 +4,7 @@
 #include "clip.hpp"
 #include "t5.hpp"
 
+
 struct SDCondition {
     struct ggml_tensor* c_crossattn = NULL;  // aka context
     struct ggml_tensor* c_vector = NULL;     // aka y
```
```diff
@@ -44,6 +45,7 @@ struct Conditioner {
 // Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/sd_hijack_clip.py#L283
 struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
     SDVersion version = VERSION_SD1;
+    PMVersion pm_version = VERSION_1;
     CLIPTokenizer tokenizer;
     ggml_type wtype;
     std::shared_ptr<CLIPTextModelRunner> text_model;
```
```diff
@@ -59,8 +61,9 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       ggml_type wtype,
                                       const std::string& embd_dir,
                                       SDVersion version = VERSION_SD1,
+                                      PMVersion pv = VERSION_1,
                                       int clip_skip = -1)
-        : version(version), tokenizer(version == VERSION_SD2 ? 0 : 49407), embd_dir(embd_dir), wtype(wtype) {
+        : version(version), pm_version(pv), tokenizer(version == VERSION_SD2 ? 0 : 49407), embd_dir(embd_dir), wtype(wtype) {
         if (clip_skip <= 0) {
             clip_skip = 1;
             if (version == VERSION_SD2 || version == VERSION_SDXL) {
```
```diff
@@ -159,7 +162,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
     tokenize_with_trigger_token(std::string text,
                                 int num_input_imgs,
                                 int32_t image_token,
-                                bool padding = false) {
+                                bool padding = false){
         return tokenize_with_trigger_token(text, num_input_imgs, image_token,
                                            text_model->model.n_token, padding);
     }
```
```diff
@@ -268,7 +271,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                 std::vector<int> clean_input_ids_tmp;
                 for (uint32_t i = 0; i < class_token_index[0]; i++)
                     clean_input_ids_tmp.push_back(clean_input_ids[i]);
-                for (uint32_t i = 0; i < num_input_imgs; i++)
+                for (uint32_t i = 0; i < (pm_version == VERSION_2 ? 2 * num_input_imgs : num_input_imgs); i++)
                     clean_input_ids_tmp.push_back(class_token);
                 for (uint32_t i = class_token_index[0] + 1; i < clean_input_ids.size(); i++)
                     clean_input_ids_tmp.push_back(clean_input_ids[i]);
```
```diff
@@ -279,13 +282,16 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                 tokens.insert(tokens.end(), clean_input_ids.begin(), clean_input_ids.end());
                 weights.insert(weights.end(), clean_input_ids.size(), curr_weight);
             }
-            tokens.insert(tokens.begin(), tokenizer.BOS_TOKEN_ID);
-            weights.insert(weights.begin(), 1.0);
+            // BUG!! double counting: pad_tokens will add BOS at the beginning
+            // tokens.insert(tokens.begin(), tokenizer.BOS_TOKEN_ID);
+            // weights.insert(weights.begin(), 1.0);
 
             tokenizer.pad_tokens(tokens, weights, max_length, padding);
-
+            int offset = pm_version == VERSION_2 ? 2 * num_input_imgs : num_input_imgs;
             for (uint32_t i = 0; i < tokens.size(); i++) {
-                if (class_idx + 1 <= i && i < class_idx + 1 + num_input_imgs)
+                // if (class_idx + 1 <= i && i < class_idx + 1 + 2*num_input_imgs) // photomaker V2 has num_tokens(=2)*num_input_imgs
+                if (class_idx + 1 <= i && i < class_idx + 1 + offset)  // photomaker V2 has num_tokens(=2)*num_input_imgs
+                    // hardcoded for now
                     class_token_mask.push_back(true);
                 else
                     class_token_mask.push_back(false);
```
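PhotoMaker V2 spends two embedding tokens per input image, so both the class-token duplication and the mask window double when `pm_version == VERSION_2`. A self-contained sketch of the mask construction (hypothetical standalone helper, not the `Conditioner` API):

```cpp
#include <cstddef>
#include <vector>

enum PMVersion { VERSION_1, VERSION_2 };

// Flag the positions right after the class token; the window is
// 2 * num_input_imgs for PhotoMaker V2, num_input_imgs for V1.
static std::vector<bool> build_class_token_mask(std::size_t n_tokens,
                                                std::size_t class_idx,
                                                int num_input_imgs,
                                                PMVersion pm_version) {
    int offset = (pm_version == VERSION_2) ? 2 * num_input_imgs : num_input_imgs;
    std::vector<bool> mask(n_tokens, false);
    for (std::size_t i = 0; i < n_tokens; i++)
        if (class_idx + 1 <= i && i < class_idx + 1 + static_cast<std::size_t>(offset))
            mask[i] = true;
    return mask;
}
```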
```diff
@@ -530,7 +536,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                         int height,
                         int num_input_imgs,
                         int adm_in_channels = -1,
-                        bool force_zero_embeddings = false) {
+                        bool force_zero_embeddings = false){
         auto image_tokens = convert_token_to_id(trigger_word);
         // if(image_tokens.size() == 1){
         //     printf(" image token id is: %d \n", image_tokens[0]);
```
```diff
@@ -798,21 +804,16 @@ struct SD3CLIPEmbedder : public Conditioner {
                 }
 
                 if (chunk_idx == 0) {
-                    // auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
-                    // max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
-                    // clip_l->compute(n_threads,
-                    //                 input_ids,
-                    //                 0,
-                    //                 NULL,
-                    //                 max_token_idx,
-                    //                 true,
-                    //                 &pooled_l,
-                    //                 work_ctx);
-
-                    // clip_l.transformer.text_model.text_projection no in file, ignore
-                    // TODO: use torch.eye(embed_dim) as default clip_l.transformer.text_model.text_projection
-                    pooled_l = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768);
-                    ggml_set_f32(pooled_l, 0.f);
+                    auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
+                    max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
+                    clip_l->compute(n_threads,
+                                    input_ids,
+                                    0,
+                                    NULL,
+                                    max_token_idx,
+                                    true,
+                                    &pooled_l,
+                                    work_ctx);
                 }
             }
 
```
```diff
@@ -852,21 +853,16 @@ struct SD3CLIPEmbedder : public Conditioner {
                 }
 
                 if (chunk_idx == 0) {
-                    // auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_g_tokenizer.EOS_TOKEN_ID);
-                    // max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
-                    // clip_g->compute(n_threads,
-                    //                 input_ids,
-                    //                 0,
-                    //                 NULL,
-                    //                 max_token_idx,
-                    //                 true,
-                    //                 &pooled_g,
-                    //                 work_ctx);
-                    // clip_l.transformer.text_model.text_projection no in file, ignore pooled_g too
-
-                    // TODO: fix pooled_g
-                    pooled_g = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1280);
-                    ggml_set_f32(pooled_g, 0.f);
+                    auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_g_tokenizer.EOS_TOKEN_ID);
+                    max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
+                    clip_g->compute(n_threads,
+                                    input_ids,
+                                    0,
+                                    NULL,
+                                    max_token_idx,
+                                    true,
+                                    &pooled_g,
+                                    work_ctx);
                 }
             }
 
```
```diff
@@ -968,7 +964,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                         int height,
                         int num_input_imgs,
                         int adm_in_channels = -1,
-                        bool force_zero_embeddings = false) {
+                        bool force_zero_embeddings = false){
         GGML_ASSERT(0 && "Not implemented yet!");
     }
 
```
```diff
@@ -1104,21 +1100,17 @@ struct FluxCLIPEmbedder : public Conditioner {
             auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
             size_t max_token_idx = 0;
 
-            // auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
-            // max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
-            // clip_l->compute(n_threads,
-            //                 input_ids,
-            //                 0,
-            //                 NULL,
-            //                 max_token_idx,
-            //                 true,
-            //                 &pooled,
-            //                 work_ctx);
-
-            // clip_l.transformer.text_model.text_projection no in file, ignore
-            // TODO: use torch.eye(embed_dim) as default clip_l.transformer.text_model.text_projection
-            pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768);
-            ggml_set_f32(pooled, 0.f);
+            auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
+            max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
+
+            clip_l->compute(n_threads,
+                            input_ids,
+                            0,
+                            NULL,
+                            max_token_idx,
+                            true,
+                            &pooled,
+                            work_ctx);
         }
 
         // t5
```
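The SD3 and Flux embedders now pool for real instead of substituting zero vectors: they locate the first EOS token in the chunk and pass that index to `compute(..., max_token_idx, true, ...)`, which works together with the `text_projection`/identity fallback added in clip.hpp. The index selection, extracted as a standalone helper with the same logic as the diff:

```cpp
#include <algorithm>
#include <cstddef>
#include <iterator>
#include <vector>

// Pooling index as selected above: the first EOS token if present,
// otherwise clamped to the last position of the chunk.
static std::size_t pooled_token_index(const std::vector<int>& chunk_tokens,
                                      int eos_token_id) {
    auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), eos_token_id);
    return std::min<std::size_t>(std::distance(chunk_tokens.begin(), it),
                                 chunk_tokens.size() - 1);
}
```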

diffusion_model.hpp

Lines changed: 11 additions & 6 deletions
```diff
@@ -17,7 +17,8 @@ struct DiffusionModel {
                          std::vector<struct ggml_tensor*> controls = {},
                          float control_strength = 0.f,
                          struct ggml_tensor** output = NULL,
-                         struct ggml_context* output_ctx = NULL) = 0;
+                         struct ggml_context* output_ctx = NULL,
+                         std::vector<int> skip_layers = std::vector<int>()) = 0;
     virtual void alloc_params_buffer() = 0;
     virtual void free_params_buffer() = 0;
     virtual void free_compute_buffer() = 0;
```
```diff
@@ -71,7 +72,9 @@ struct UNetModel : public DiffusionModel {
                  std::vector<struct ggml_tensor*> controls = {},
                  float control_strength = 0.f,
                  struct ggml_tensor** output = NULL,
-                 struct ggml_context* output_ctx = NULL) {
+                 struct ggml_context* output_ctx = NULL,
+                 std::vector<int> skip_layers = std::vector<int>()) {
+        (void)skip_layers;  // SLG doesn't work with UNet models
         return unet.compute(n_threads, x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength, output, output_ctx);
     }
 };
```
```diff
@@ -120,8 +123,9 @@ struct MMDiTModel : public DiffusionModel {
                  std::vector<struct ggml_tensor*> controls = {},
                  float control_strength = 0.f,
                  struct ggml_tensor** output = NULL,
-                 struct ggml_context* output_ctx = NULL) {
-        return mmdit.compute(n_threads, x, timesteps, context, y, output, output_ctx);
+                 struct ggml_context* output_ctx = NULL,
+                 std::vector<int> skip_layers = std::vector<int>()) {
+        return mmdit.compute(n_threads, x, timesteps, context, y, output, output_ctx, skip_layers);
     }
 };
 
```
```diff
@@ -170,8 +174,9 @@ struct FluxModel : public DiffusionModel {
                  std::vector<struct ggml_tensor*> controls = {},
                  float control_strength = 0.f,
                  struct ggml_tensor** output = NULL,
-                 struct ggml_context* output_ctx = NULL) {
-        return flux.compute(n_threads, x, timesteps, context, y, guidance, output, output_ctx);
+                 struct ggml_context* output_ctx = NULL,
+                 std::vector<int> skip_layers = std::vector<int>()) {
+        return flux.compute(n_threads, x, timesteps, context, y, guidance, output, output_ctx, skip_layers);
     }
 };
 
```
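The new `skip_layers` parameter threads through the `DiffusionModel` interface so that MMDiT and Flux can implement skip-layer guidance (SLG) by bypassing the listed transformer blocks on a guidance pass; `UNetModel` accepts the parameter but deliberately ignores it. A hedged sketch of how a backbone might honor the list (hypothetical `Block` type, not the actual mmdit/flux code):

```cpp
#include <algorithm>
#include <functional>
#include <vector>

using Tensor = std::vector<float>;
using Block  = std::function<Tensor(const Tensor&)>;

// Run every block except those listed in skip_layers: the bypass that
// skip-layer guidance performs on its extra guidance pass.
static Tensor run_blocks(Tensor x, const std::vector<Block>& blocks,
                         const std::vector<int>& skip_layers) {
    for (int i = 0; i < (int)blocks.size(); i++) {
        if (std::find(skip_layers.begin(), skip_layers.end(), i) != skip_layers.end())
            continue;  // this layer is skipped on the SLG pass
        x = blocks[i](x);
    }
    return x;
}
```
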
docs/photo_maker.md

Lines changed: 23 additions & 1 deletion
````diff
@@ -29,4 +29,26 @@ Example:
 
 ```bash
 bin/sd -m ../models/sdxlUnstableDiffusers_v11.safetensors --vae ../models/sdxl_vae.safetensors --stacked-id-embd-dir ../models/photomaker-v1.safetensors --input-id-images-dir ../assets/photomaker_examples/scarletthead_woman -p "a girl img, retro futurism, retro game art style but extremely beautiful, intricate details, masterpiece, best quality, space-themed, cosmic, celestial, stars, galaxies, nebulas, planets, science fiction, highly detailed" -n "realistic, photo-realistic, worst quality, greyscale, bad anatomy, bad hands, error, text" --cfg-scale 5.0 --sampling-method euler -H 1024 -W 1024 --style-ratio 10 --vae-on-cpu -o output.png
-```
+```
+
+## PhotoMaker Version 2
+
+[PhotoMaker Version 2 (PMV2)](https://github.com/TencentARC/PhotoMaker/blob/main/README_pmv2.md) has some key improvements. Unfortunately, it has a very heavy dependency, which makes running it a bit involved in `SD.cpp`.
+
+Running PMV2 is now a two-step process:
+
+- Run the Python script `face_detect.py` to obtain **id_embeds** for the given input images:
+  ```
+  python face_detect.py input_image_dir
+  ```
+  An `id_embeds.safetensors` file will be generated in `input_image_dir`.
+
+  **Note: this step only needs to be run once; the same `id_embeds` can be reused.**
+
+- Run the same command as in version 1, but replace `photomaker-v1.safetensors` with `photomaker-v2.safetensors`.
+
+  You can download `photomaker-v2.safetensors` from [here](https://huggingface.co/bssrdf/PhotoMakerV2).
+
+- All the command-line parameters from version 1 remain the same for version 2.
+
+
````
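For concreteness, a version-2 run would reuse the version-1 example above with only the weights path swapped (all other flags unchanged; paths illustrative):

```bash
bin/sd -m ../models/sdxlUnstableDiffusers_v11.safetensors --vae ../models/sdxl_vae.safetensors --stacked-id-embd-dir ../models/photomaker-v2.safetensors --input-id-images-dir ../assets/photomaker_examples/scarletthead_woman -p "a girl img, retro futurism, retro game art style but extremely beautiful, intricate details, masterpiece, best quality, space-themed, cosmic, celestial, stars, galaxies, nebulas, planets, science fiction, highly detailed" -n "realistic, photo-realistic, worst quality, greyscale, bad anatomy, bad hands, error, text" --cfg-scale 5.0 --sampling-method euler -H 1024 -W 1024 --style-ratio 10 --vae-on-cpu -o output.png
```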
