@@ -709,18 +709,18 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context*
 
     float scale = (1.0f / sqrt((float)d_head));
 
-    // if (flash_attn) {
-    //     LOG_DEBUG("attention_ext L_q:%d L_k:%d n_head:%d C:%d d_head:%d N:%d", L_q, L_k, n_head, C, d_head, N);
-    // }
-    // is there anything oddly shaped?? ping Green-Sky if you can trip this assert
+    // if (flash_attn) {
+    //     LOG_DEBUG("attention_ext L_q:%d L_k:%d n_head:%d C:%d d_head:%d N:%d", L_q, L_k, n_head, C, d_head, N);
+    // }
+    // is there anything oddly shaped?? ping Green-Sky if you can trip this assert
     GGML_ASSERT(((L_k % 256 == 0) && L_q == L_k) || !(L_k % 256 == 0));
 
     bool can_use_flash_attn = true;
-    can_use_flash_attn = can_use_flash_attn && L_k % 256 == 0;
-    can_use_flash_attn = can_use_flash_attn && d_head % 64 == 0;  // double check
+    can_use_flash_attn = can_use_flash_attn && L_k % 256 == 0;
+    can_use_flash_attn = can_use_flash_attn && d_head % 64 == 0;  // double check
 
     // cuda max d_head seems to be 256, cpu does seem to work with 512
-    can_use_flash_attn = can_use_flash_attn && d_head <= 256;  // double check
+    can_use_flash_attn = can_use_flash_attn && d_head <= 256;  // double check
 
     if (mask != nullptr) {
         // TODO(Green-Sky): figure out if we can bend t5 to work too
@@ -731,9 +731,9 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context*
     // TODO(Green-Sky): more pad or disable for funny tensor shapes
 
     ggml_tensor* kqv = nullptr;
-    // GGML_ASSERT((flash_attn && can_use_flash_attn) || !flash_attn);
+    // GGML_ASSERT((flash_attn && can_use_flash_attn) || !flash_attn);
     if (can_use_flash_attn && flash_attn) {
-        // LOG_DEBUG("using flash attention");
+        // LOG_DEBUG("using flash attention");
         k = ggml_cast(ctx, k, GGML_TYPE_F16);
 
         v = ggml_cont(ctx, ggml_permute(ctx, v, 0, 2, 1, 3));  // [N, n_head, L_k, d_head]
@@ -743,7 +743,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context*
         kqv = ggml_flash_attn_ext(ctx, q, k, v, mask, scale, 0, 0);
         ggml_flash_attn_ext_set_prec(kqv, GGML_PREC_F32);
 
-        // kqv = ggml_view_3d(ctx, kqv, d_head, n_head, L_k, kqv->nb[1], kqv->nb[2], 0);
+        // kqv = ggml_view_3d(ctx, kqv, d_head, n_head, L_k, kqv->nb[1], kqv->nb[2], 0);
         kqv = ggml_view_3d(ctx, kqv, d_head, n_head, L_q, kqv->nb[1], kqv->nb[2], 0);
     } else {
         v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3));  // [N, n_head, d_head, L_k]
@@ -761,8 +761,8 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context*
 
         kqv = ggml_mul_mat(ctx, v, kq);  // [N * n_head, L_q, d_head]
 
-        kqv = ggml_reshape_4d(ctx, kqv, d_head, L_q, n_head, N);  // [N, n_head, L_q, d_head]
-        kqv = ggml_permute(ctx, kqv, 0, 2, 1, 3);  // [N, L_q, n_head, d_head]
+        kqv = ggml_reshape_4d(ctx, kqv, d_head, L_q, n_head, N);  // [N, n_head, L_q, d_head]
+        kqv = ggml_permute(ctx, kqv, 0, 2, 1, 3);  // [N, L_q, n_head, d_head]
     }
 
     kqv = ggml_cont(ctx, kqv);
@@ -1057,7 +1057,7 @@ struct GGMLRunner {
         // get_desc().c_str(),
         // params_buffer_size / (1024.0 * 1024.0),
         // ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
-        // num_tensors);
+        // num_tensors);
         return true;
     }
 
@@ -1227,8 +1227,7 @@ class Linear : public UnaryBlock {
         params["weight"] = ggml_new_tensor_2d(ctx, wtype, in_features, out_features);
         if (bias) {
             params["bias"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_features);
-        }
-
+        }
     }
 
 public:
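
Aside on the first hunk: that is where the fused flash-attention path is gated on tensor shape before `ggml_flash_attn_ext` is ever called. A minimal, self-contained sketch of just that predicate, based only on the checks visible in the diff (the helper `can_use_flash_attn_for`, the `main` driver, and the example shapes are illustrative, not names or values from the repo):

```cpp
#include <cstdint>
#include <cstdio>

// Mirrors the runtime checks shown in ggml_nn_attention_ext: the fused kernel
// is only attempted when the kv length is a multiple of 256 and the head
// dimension is a multiple of 64 and at most 256 (the in-code comment notes
// CUDA tops out around 256 while the CPU backend seems to handle 512).
static bool can_use_flash_attn_for(int64_t L_k, int64_t d_head) {
    bool ok = true;
    ok = ok && L_k % 256 == 0;
    ok = ok && d_head % 64 == 0;
    ok = ok && d_head <= 256;
    return ok;
}

int main() {
    // Example: 4096-token self-attention with 64-dim heads -> gate passes
    printf("%d\n", can_use_flash_attn_for(4096, 64));  // prints 1
    // Example: cross-attention against a 77-token text context -> gate fails
    printf("%d\n", can_use_flash_attn_for(77, 64));     // prints 0
    return 0;
}
```

When the gate fails (or `flash_attn` was not requested), execution takes the `else` branch shown in the later hunks, i.e. the explicit permute/`ggml_mul_mat`/reshape path instead of the fused kernel.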