
Commit 6600b89

Merge branch 'master' into rescue_flash_attn

2 parents: cbf0489 + ea9b647

18 files changed: +1399 -200 lines

README.md

Lines changed: 3 additions & 0 deletions

```diff
@@ -305,12 +305,15 @@ These projects wrap `stable-diffusion.cpp` for easier use in other languages/frameworks:
 
 * Golang: [seasonjs/stable-diffusion](https://github.com/seasonjs/stable-diffusion)
 * C#: [DarthAffe/StableDiffusion.NET](https://github.com/DarthAffe/StableDiffusion.NET)
+* Python: [william-murray1204/stable-diffusion-cpp-python](https://github.com/william-murray1204/stable-diffusion-cpp-python)
+* Rust: [newfla/diffusion-rs](https://github.com/newfla/diffusion-rs)
 
 ## UIs
 
 These projects use `stable-diffusion.cpp` as a backend for their image generation.
 
 - [Jellybox](https://jellybox.com)
+- [Stable Diffusion GUI](https://github.com/fszontagh/sd.cpp.gui.wx)
 
 ## Contributors
```
clip.hpp

Lines changed: 28 additions & 7 deletions
```diff
@@ -343,6 +343,14 @@ class CLIPTokenizer {
         }
     }
 
+    std::string clean_up_tokenization(std::string& text) {
+
+        std::regex pattern(R"( ,)");
+        // Replace " ," with ","
+        std::string result = std::regex_replace(text, pattern, ",");
+        return result;
+    }
+
     std::string decode(const std::vector<int>& tokens) {
         std::string text = "";
         for (int t : tokens) {
```
```diff
@@ -351,8 +359,12 @@ class CLIPTokenizer {
             std::u32string ts = decoder[t];
             // printf("%d, %s \n", t, utf32_to_utf8(ts).c_str());
             std::string s = utf32_to_utf8(ts);
-            if (s.length() >= 4 && ends_with(s, "</w>")) {
-                text += " " + s.replace(s.length() - 4, s.length() - 1, "");
+            if (s.length() >= 4) {
+                if (ends_with(s, "</w>")) {
+                    text += s.replace(s.length() - 4, s.length() - 1, "") + " ";
+                } else {
+                    text += s;
+                }
             } else {
                 text += " " + s;
             }
```
```diff
@@ -364,6 +376,7 @@ class CLIPTokenizer {
 
         // std::string s((char *)bytes.data());
         // std::string s = "";
+        text = clean_up_tokenization(text);
         return trim(text);
     }
 
```
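The `decode()` change above moves the word-boundary space from before a `</w>`-terminated token to after it, which can leave a stray space in front of punctuation tokens; `clean_up_tokenization` then collapses `" ,"` back to `","`. A standalone sketch of that cleanup, assuming the single `" ,"` pattern shown in the diff is the only artifact handled:

```cpp
#include <cassert>
#include <regex>
#include <string>

// Collapse the space that decode() now emits before a "," token.
static std::string clean_up_tokenization(const std::string& text) {
    return std::regex_replace(text, std::regex(R"( ,)"), ",");
}

int main() {
    // "a" "girl</w>" "," decodes to "a girl ," -> cleaned to "a girl,"
    assert(clean_up_tokenization("a girl , retro futurism") == "a girl, retro futurism");
    return 0;
}
```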

```diff
@@ -711,8 +724,12 @@ class CLIPTextModel : public GGMLBlock {
         if (return_pooled) {
             auto text_projection = params["text_projection"];
             ggml_tensor* pooled = ggml_view_1d(ctx, x, hidden_size, x->nb[1] * max_token_idx);
-            pooled = ggml_mul_mat(ctx, ggml_cont(ctx, ggml_transpose(ctx, text_projection)), pooled);
-            return pooled;
+            if (text_projection != NULL) {
+                pooled = ggml_nn_linear(ctx, pooled, text_projection, NULL);
+            } else {
+                LOG_DEBUG("Missing text_projection matrix, assuming identity...");
+            }
+            return pooled;  // [hidden_size, 1, 1]
         }
 
         return x;  // [N, n_token, hidden_size]
```
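`CLIPTextModel::forward` now applies `text_projection` only when the checkpoint provides one, and otherwise returns the raw pooled hidden state, treating the missing matrix as an identity. A shape-level sketch of that fallback with plain `std::vector` stand-ins (hypothetical types, not the ggml graph API):

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

using Vec = std::vector<float>;
using Mat = std::vector<Vec>;  // [out_dim][in_dim]

// Project the pooled embedding when a text_projection matrix exists;
// otherwise pass it through unchanged (identity), as in the diff above.
static Vec project_pooled(const Vec& pooled, const Mat* text_projection) {
    if (text_projection == nullptr) {
        std::fprintf(stderr, "Missing text_projection matrix, assuming identity...\n");
        return pooled;
    }
    Vec out(text_projection->size(), 0.0f);
    for (std::size_t r = 0; r < text_projection->size(); r++)
        for (std::size_t c = 0; c < pooled.size(); c++)
            out[r] += (*text_projection)[r][c] * pooled[c];
    return out;
}
```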
```diff
@@ -751,7 +768,8 @@ class CLIPVisionModel : public GGMLBlock {
         blocks["post_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
     }
 
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values, bool return_pooled = true) {
+    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values,
+                                bool return_pooled = true) {
         // pixel_values: [N, num_channels, image_size, image_size]
         auto embeddings = std::dynamic_pointer_cast<CLIPVisionEmbeddings>(blocks["embeddings"]);
         auto pre_layernorm = std::dynamic_pointer_cast<LayerNorm>(blocks["pre_layernorm"]);
```
```diff
@@ -761,14 +779,17 @@ class CLIPVisionModel : public GGMLBlock {
         auto x = embeddings->forward(ctx, pixel_values);  // [N, num_positions, embed_dim]
         x = pre_layernorm->forward(ctx, x);
         x = encoder->forward(ctx, x, -1, false);
+        // print_ggml_tensor(x, true, "ClipVisionModel x: ");
+        auto last_hidden_state = x;
         x = post_layernorm->forward(ctx, x);  // [N, n_token, hidden_size]
 
-        GGML_ASSERT(x->ne[3] == 1);
+        GGML_ASSERT(x->ne[3] == 1);
         if (return_pooled) {
             ggml_tensor* pooled = ggml_cont(ctx, ggml_view_2d(ctx, x, x->ne[0], x->ne[2], x->nb[2], 0));
             return pooled;  // [N, hidden_size]
         } else {
-            return x;  // [N, n_token, hidden_size]
+            // return x; // [N, n_token, hidden_size]
+            return last_hidden_state;  // [N, n_token, hidden_size]
         }
     }
 };
```
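`CLIPVisionModel::forward` now snapshots the encoder output before `post_layernorm`; the non-pooled path returns that un-normalized `last_hidden_state`, while the pooled path still takes the first (class) token of the normalized output. A toy illustration of the new control flow, assuming `std::vector` stand-ins rather than ggml tensors:

```cpp
#include <vector>

using Token  = std::vector<float>;
using Hidden = std::vector<Token>;  // [n_token, hidden_size]

// Stand-in for the final layernorm; identity here, just to keep the sketch runnable.
static Hidden post_layernorm(const Hidden& x) { return x; }

static Hidden vision_forward(const Hidden& encoder_out, bool return_pooled) {
    Hidden last_hidden_state = encoder_out;  // saved *before* the final layernorm
    Hidden x = post_layernorm(encoder_out);
    if (return_pooled)
        return Hidden{x[0]};      // class-token embedding -> [1, hidden_size]
    return last_hidden_state;     // un-normalized -> [n_token, hidden_size]
}
```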

conditioner.hpp

Lines changed: 46 additions & 54 deletions
```diff
@@ -4,6 +4,7 @@
 #include "clip.hpp"
 #include "t5.hpp"
 
+
 struct SDCondition {
     struct ggml_tensor* c_crossattn = NULL;  // aka context
     struct ggml_tensor* c_vector = NULL;     // aka y
```
```diff
@@ -44,6 +45,7 @@ struct Conditioner {
 // Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/sd_hijack_clip.py#L283
 struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
     SDVersion version = VERSION_SD1;
+    PMVersion pm_version = VERSION_1;
     CLIPTokenizer tokenizer;
     ggml_type wtype;
     std::shared_ptr<CLIPTextModelRunner> text_model;
```
```diff
@@ -59,8 +61,9 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       ggml_type wtype,
                                       const std::string& embd_dir,
                                       SDVersion version = VERSION_SD1,
+                                      PMVersion pv = VERSION_1,
                                       int clip_skip = -1)
-        : version(version), tokenizer(version == VERSION_SD2 ? 0 : 49407), embd_dir(embd_dir), wtype(wtype) {
+        : version(version), pm_version(pv), tokenizer(version == VERSION_SD2 ? 0 : 49407), embd_dir(embd_dir), wtype(wtype) {
         if (clip_skip <= 0) {
             clip_skip = 1;
             if (version == VERSION_SD2 || version == VERSION_SDXL) {
```
```diff
@@ -159,7 +162,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
     tokenize_with_trigger_token(std::string text,
                                 int num_input_imgs,
                                 int32_t image_token,
-                                bool padding = false) {
+                                bool padding = false){
         return tokenize_with_trigger_token(text, num_input_imgs, image_token,
                                            text_model->model.n_token, padding);
     }
```
```diff
@@ -268,7 +271,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                 std::vector<int> clean_input_ids_tmp;
                 for (uint32_t i = 0; i < class_token_index[0]; i++)
                     clean_input_ids_tmp.push_back(clean_input_ids[i]);
-                for (uint32_t i = 0; i < num_input_imgs; i++)
+                for (uint32_t i = 0; i < (pm_version == VERSION_2 ? 2 * num_input_imgs : num_input_imgs); i++)
                     clean_input_ids_tmp.push_back(class_token);
                 for (uint32_t i = class_token_index[0] + 1; i < clean_input_ids.size(); i++)
                     clean_input_ids_tmp.push_back(clean_input_ids[i]);
```
```diff
@@ -279,13 +282,16 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                 tokens.insert(tokens.end(), clean_input_ids.begin(), clean_input_ids.end());
                 weights.insert(weights.end(), clean_input_ids.size(), curr_weight);
             }
-            tokens.insert(tokens.begin(), tokenizer.BOS_TOKEN_ID);
-            weights.insert(weights.begin(), 1.0);
+            // BUG!! double counting: pad_tokens will add BOS at the beginning
+            // tokens.insert(tokens.begin(), tokenizer.BOS_TOKEN_ID);
+            // weights.insert(weights.begin(), 1.0);
 
             tokenizer.pad_tokens(tokens, weights, max_length, padding);
-
+            int offset = pm_version == VERSION_2 ? 2 * num_input_imgs : num_input_imgs;
             for (uint32_t i = 0; i < tokens.size(); i++) {
-                if (class_idx + 1 <= i && i < class_idx + 1 + num_input_imgs)
+                // if (class_idx + 1 <= i && i < class_idx + 1 + 2*num_input_imgs) // photomaker V2 has num_tokens(=2)*num_input_imgs
+                if (class_idx + 1 <= i && i < class_idx + 1 + offset)  // photomaker V2 has num_tokens(=2)*num_input_imgs
+                    // hardcoded for now
                     class_token_mask.push_back(true);
                 else
                     class_token_mask.push_back(false);
```
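PhotoMaker V2 spends two embedding tokens per input image, so both the class-token duplication and the mask window double when `pm_version == VERSION_2`. A self-contained sketch of the mask construction (hypothetical standalone helper, not the `Conditioner` API):

```cpp
#include <cstddef>
#include <vector>

enum PMVersion { VERSION_1, VERSION_2 };

// Flag the positions right after the class token; the window is
// 2 * num_input_imgs for PhotoMaker V2, num_input_imgs for V1.
static std::vector<bool> build_class_token_mask(std::size_t n_tokens,
                                                std::size_t class_idx,
                                                int num_input_imgs,
                                                PMVersion pm_version) {
    int offset = (pm_version == VERSION_2) ? 2 * num_input_imgs : num_input_imgs;
    std::vector<bool> mask(n_tokens, false);
    for (std::size_t i = 0; i < n_tokens; i++)
        if (class_idx + 1 <= i && i < class_idx + 1 + static_cast<std::size_t>(offset))
            mask[i] = true;
    return mask;
}
```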
```diff
@@ -530,7 +536,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                         int height,
                         int num_input_imgs,
                         int adm_in_channels = -1,
-                        bool force_zero_embeddings = false) {
+                        bool force_zero_embeddings = false){
         auto image_tokens = convert_token_to_id(trigger_word);
         // if(image_tokens.size() == 1){
         //     printf(" image token id is: %d \n", image_tokens[0]);
```
```diff
@@ -798,21 +804,16 @@ struct SD3CLIPEmbedder : public Conditioner {
                 }
 
                 if (chunk_idx == 0) {
-                    // auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
-                    // max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
-                    // clip_l->compute(n_threads,
-                    //                 input_ids,
-                    //                 0,
-                    //                 NULL,
-                    //                 max_token_idx,
-                    //                 true,
-                    //                 &pooled_l,
-                    //                 work_ctx);
-
-                    // clip_l.transformer.text_model.text_projection no in file, ignore
-                    // TODO: use torch.eye(embed_dim) as default clip_l.transformer.text_model.text_projection
-                    pooled_l = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768);
-                    ggml_set_f32(pooled_l, 0.f);
+                    auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
+                    max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
+                    clip_l->compute(n_threads,
+                                    input_ids,
+                                    0,
+                                    NULL,
+                                    max_token_idx,
+                                    true,
+                                    &pooled_l,
+                                    work_ctx);
                 }
             }
 
```
```diff
@@ -852,21 +853,16 @@ struct SD3CLIPEmbedder : public Conditioner {
                 }
 
                 if (chunk_idx == 0) {
-                    // auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_g_tokenizer.EOS_TOKEN_ID);
-                    // max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
-                    // clip_g->compute(n_threads,
-                    //                 input_ids,
-                    //                 0,
-                    //                 NULL,
-                    //                 max_token_idx,
-                    //                 true,
-                    //                 &pooled_g,
-                    //                 work_ctx);
-                    // clip_l.transformer.text_model.text_projection no in file, ignore pooled_g too
-
-                    // TODO: fix pooled_g
-                    pooled_g = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1280);
-                    ggml_set_f32(pooled_g, 0.f);
+                    auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_g_tokenizer.EOS_TOKEN_ID);
+                    max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
+                    clip_g->compute(n_threads,
+                                    input_ids,
+                                    0,
+                                    NULL,
+                                    max_token_idx,
+                                    true,
+                                    &pooled_g,
+                                    work_ctx);
                 }
             }
 
```
```diff
@@ -968,7 +964,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                         int height,
                         int num_input_imgs,
                         int adm_in_channels = -1,
-                        bool force_zero_embeddings = false) {
+                        bool force_zero_embeddings = false){
         GGML_ASSERT(0 && "Not implemented yet!");
     }
 
```
```diff
@@ -1104,21 +1100,17 @@ struct FluxCLIPEmbedder : public Conditioner {
             auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
             size_t max_token_idx = 0;
 
-            // auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
-            // max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
-            // clip_l->compute(n_threads,
-            //                 input_ids,
-            //                 0,
-            //                 NULL,
-            //                 max_token_idx,
-            //                 true,
-            //                 &pooled,
-            //                 work_ctx);
-
-            // clip_l.transformer.text_model.text_projection no in file, ignore
-            // TODO: use torch.eye(embed_dim) as default clip_l.transformer.text_model.text_projection
-            pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768);
-            ggml_set_f32(pooled, 0.f);
+            auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
+            max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
+
+            clip_l->compute(n_threads,
+                            input_ids,
+                            0,
+                            NULL,
+                            max_token_idx,
+                            true,
+                            &pooled,
+                            work_ctx);
         }
 
         // t5
```
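The SD3 and Flux embedders now pool for real instead of substituting zero vectors: they locate the first EOS token in the chunk and pass that index to `compute(..., max_token_idx, true, ...)`, which works together with the `text_projection`/identity fallback added in clip.hpp. The index selection, extracted as a standalone helper with the same logic as the diff:

```cpp
#include <algorithm>
#include <cstddef>
#include <iterator>
#include <vector>

// Pooling index as selected above: the first EOS token if present,
// otherwise clamped to the last position of the chunk.
static std::size_t pooled_token_index(const std::vector<int>& chunk_tokens,
                                      int eos_token_id) {
    auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), eos_token_id);
    return std::min<std::size_t>(std::distance(chunk_tokens.begin(), it),
                                 chunk_tokens.size() - 1);
}
```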

diffusion_model.hpp

Lines changed: 11 additions & 6 deletions
```diff
@@ -17,7 +17,8 @@ struct DiffusionModel {
                          std::vector<struct ggml_tensor*> controls = {},
                          float control_strength = 0.f,
                          struct ggml_tensor** output = NULL,
-                         struct ggml_context* output_ctx = NULL) = 0;
+                         struct ggml_context* output_ctx = NULL,
+                         std::vector<int> skip_layers = std::vector<int>()) = 0;
     virtual void alloc_params_buffer() = 0;
     virtual void free_params_buffer() = 0;
     virtual void free_compute_buffer() = 0;
```
```diff
@@ -71,7 +72,9 @@ struct UNetModel : public DiffusionModel {
                  std::vector<struct ggml_tensor*> controls = {},
                  float control_strength = 0.f,
                  struct ggml_tensor** output = NULL,
-                 struct ggml_context* output_ctx = NULL) {
+                 struct ggml_context* output_ctx = NULL,
+                 std::vector<int> skip_layers = std::vector<int>()) {
+        (void)skip_layers;  // SLG doesn't work with UNet models
         return unet.compute(n_threads, x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength, output, output_ctx);
     }
 };
```
```diff
@@ -120,8 +123,9 @@ struct MMDiTModel : public DiffusionModel {
                  std::vector<struct ggml_tensor*> controls = {},
                  float control_strength = 0.f,
                  struct ggml_tensor** output = NULL,
-                 struct ggml_context* output_ctx = NULL) {
-        return mmdit.compute(n_threads, x, timesteps, context, y, output, output_ctx);
+                 struct ggml_context* output_ctx = NULL,
+                 std::vector<int> skip_layers = std::vector<int>()) {
+        return mmdit.compute(n_threads, x, timesteps, context, y, output, output_ctx, skip_layers);
     }
 };
 
```
```diff
@@ -170,8 +174,9 @@ struct FluxModel : public DiffusionModel {
                  std::vector<struct ggml_tensor*> controls = {},
                  float control_strength = 0.f,
                  struct ggml_tensor** output = NULL,
-                 struct ggml_context* output_ctx = NULL) {
-        return flux.compute(n_threads, x, timesteps, context, y, guidance, output, output_ctx);
+                 struct ggml_context* output_ctx = NULL,
+                 std::vector<int> skip_layers = std::vector<int>()) {
+        return flux.compute(n_threads, x, timesteps, context, y, guidance, output, output_ctx, skip_layers);
     }
 };
 
```
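The new `skip_layers` parameter threads through the `DiffusionModel` interface so that MMDiT and Flux can implement skip-layer guidance (SLG) by bypassing the listed transformer blocks on a guidance pass; `UNetModel` accepts the parameter but deliberately ignores it. A hedged sketch of how a backbone might honor the list (hypothetical `Block` type, not the actual mmdit/flux code):

```cpp
#include <algorithm>
#include <functional>
#include <vector>

using Tensor = std::vector<float>;
using Block  = std::function<Tensor(const Tensor&)>;

// Run every block except those listed in skip_layers: the bypass that
// skip-layer guidance performs on its extra guidance pass.
static Tensor run_blocks(Tensor x, const std::vector<Block>& blocks,
                         const std::vector<int>& skip_layers) {
    for (int i = 0; i < (int)blocks.size(); i++) {
        if (std::find(skip_layers.begin(), skip_layers.end(), i) != skip_layers.end())
            continue;  // this layer is skipped on the SLG pass
        x = blocks[i](x);
    }
    return x;
}
```
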
docs/photo_maker.md

Lines changed: 23 additions & 1 deletion
````diff
@@ -29,4 +29,26 @@ Example:
 
 ```bash
 bin/sd -m ../models/sdxlUnstableDiffusers_v11.safetensors --vae ../models/sdxl_vae.safetensors --stacked-id-embd-dir ../models/photomaker-v1.safetensors --input-id-images-dir ../assets/photomaker_examples/scarletthead_woman -p "a girl img, retro futurism, retro game art style but extremely beautiful, intricate details, masterpiece, best quality, space-themed, cosmic, celestial, stars, galaxies, nebulas, planets, science fiction, highly detailed" -n "realistic, photo-realistic, worst quality, greyscale, bad anatomy, bad hands, error, text" --cfg-scale 5.0 --sampling-method euler -H 1024 -W 1024 --style-ratio 10 --vae-on-cpu -o output.png
-```
+```
+
+## PhotoMaker Version 2
+
+[PhotoMaker Version 2 (PMV2)](https://github.com/TencentARC/PhotoMaker/blob/main/README_pmv2.md) has some key improvements. Unfortunately, it has a very heavy dependency, which makes running it a bit involved in `SD.cpp`.
+
+Running PMV2 is now a two-step process:
+
+- Run the Python script `face_detect.py` to obtain **id_embeds** for the given input images:
+  ```
+  python face_detect.py input_image_dir
+  ```
+  An `id_embeds.safetensors` file will be generated in `input_image_dir`.
+
+  **Note: this step only needs to be run once; the same `id_embeds` can be reused.**
+
+- Run the same command as in version 1, but replace `photomaker-v1.safetensors` with `photomaker-v2.safetensors`.
+
+  You can download `photomaker-v2.safetensors` from [here](https://huggingface.co/bssrdf/PhotoMakerV2).
+
+- All the command-line parameters from version 1 remain the same for version 2.
+
+
````
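For concreteness, a version-2 run would reuse the version-1 example above with only the weights path swapped (all other flags unchanged; paths illustrative):

```bash
bin/sd -m ../models/sdxlUnstableDiffusers_v11.safetensors --vae ../models/sdxl_vae.safetensors --stacked-id-embd-dir ../models/photomaker-v2.safetensors --input-id-images-dir ../assets/photomaker_examples/scarletthead_woman -p "a girl img, retro futurism, retro game art style but extremely beautiful, intricate details, masterpiece, best quality, space-themed, cosmic, celestial, stars, galaxies, nebulas, planets, science fiction, highly detailed" -n "realistic, photo-realistic, worst quality, greyscale, bad anatomy, bad hands, error, text" --cfg-scale 5.0 --sampling-method euler -H 1024 -W 1024 --style-ratio 10 --vae-on-cpu -o output.png
```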
