@@ -1648,6 +1648,19 @@ struct LLMEmbedder : public Conditioner {
16481648 }
16491649 }
16501650
1651+ size_t get_utf8_char_len (char c) {
1652+ unsigned char uc = static_cast <unsigned char >(c);
1653+ if ((uc & 0x80 ) == 0 )
1654+ return 1 ; // ASCII (1 byte)
1655+ if ((uc & 0xE0 ) == 0xC0 )
1656+ return 2 ; // 2-byte char
1657+ if ((uc & 0xF0 ) == 0xE0 )
1658+ return 3 ; // 3-byte char (Common for Chinese/Japanese)
1659+ if ((uc & 0xF8 ) == 0xF0 )
1660+ return 4 ; // 4-byte char (Emojis, etc.)
1661+ return 1 ; // Fallback (should not happen in valid UTF-8)
1662+ }
1663+
16511664 std::tuple<std::vector<int >, std::vector<float >> tokenize (
16521665 std::string text,
16531666 std::pair<int , int > attn_range,
@@ -1667,16 +1680,6 @@ struct LLMEmbedder : public Conditioner {
16671680 }
16681681 parsed_attention.emplace_back (text.substr (attn_range.second ), 1 .f );
16691682
1670- // {
1671- // std::stringstream ss;
1672- // ss << '[';
1673- // for (const auto& item : parsed_attention) {
1674- // ss << "['" << item.first << "', " << item.second << "], ";
1675- // }
1676- // ss << ']';
1677- // LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
1678- // }
1679-
16801683 std::vector<int > tokens;
16811684 std::vector<float > weights;
16821685
@@ -1685,46 +1688,47 @@ struct LLMEmbedder : public Conditioner {
16851688 float curr_weight = item.second ;
16861689
16871690 if (spell_quotes) {
1688- std::vector<std:: string> parts ;
1691+ std::string buffer ;
16891692 bool in_quote = false ;
1690- std::string current_part;
16911693
1692- for (char c : curr_text) {
1693- if (c == ' "' ) {
1694- if (!current_part.empty ()) {
1695- parts.push_back (current_part);
1696- current_part.clear ();
1694+ size_t i = 0 ;
1695+ while (i < curr_text.size ()) {
1696+ // a UTF-8 character can be 1-4 bytes
1697+ size_t char_len = get_utf8_char_len (curr_text[i]);
1698+
1699+ // Safety check to prevent reading past end of string
1700+ if (i + char_len > curr_text.size ()) {
1701+ char_len = curr_text.size () - i;
1702+ }
1703+ std::string uchar = curr_text.substr (i, char_len);
1704+ i += char_len;
1705+
1706+ if (uchar == " \" " ) {
1707+ buffer += uchar;
1708+ // If we were accumulating normal text, flush it now (the quote char was appended above, so it is included in the flushed text)
1709+ if (!in_quote) {
1710+ std::vector<int > part_tokens = tokenizer->tokenize (buffer, nullptr );
1711+ tokens.insert (tokens.end (), part_tokens.begin (), part_tokens.end ());
1712+ weights.insert (weights.end (), part_tokens.size (), curr_weight);
1713+ buffer.clear ();
16971714 }
16981715 in_quote = !in_quote;
16991716 } else {
1700- current_part += c;
1701- if (in_quote && current_part.size () == 1 ) {
1702- parts.push_back (current_part);
1703- current_part.clear ();
1704- }
1705- }
1706- }
1707- if (!current_part.empty ()) {
1708- parts.push_back (current_part);
1709- }
1710-
1711- for (const auto & part : parts) {
1712- if (part.empty ())
1713- continue ;
1714- if (part[0 ] == ' "' && part.back () == ' "' ) {
1715- std::string quoted_content = part.substr (1 , part.size () - 2 );
1716- for (char ch : quoted_content) {
1717- std::string char_str (1 , ch);
1718- std::vector<int > char_tokens = tokenizer->tokenize (char_str, nullptr );
1717+ if (in_quote) {
1718+ std::vector<int > char_tokens = tokenizer->tokenize (uchar, nullptr );
17191719 tokens.insert (tokens.end (), char_tokens.begin (), char_tokens.end ());
17201720 weights.insert (weights.end (), char_tokens.size (), curr_weight);
1721+ } else {
1722+ buffer += uchar;
17211723 }
1722- } else {
1723- std::vector<int > part_tokens = tokenizer->tokenize (part, nullptr );
1724- tokens.insert (tokens.end (), part_tokens.begin (), part_tokens.end ());
1725- weights.insert (weights.end (), part_tokens.size (), curr_weight);
17261724 }
17271725 }
1726+
1727+ if (!buffer.empty ()) {
1728+ std::vector<int > part_tokens = tokenizer->tokenize (buffer, nullptr );
1729+ tokens.insert (tokens.end (), part_tokens.begin (), part_tokens.end ());
1730+ weights.insert (weights.end (), part_tokens.size (), curr_weight);
1731+ }
17281732 } else {
17291733 std::vector<int > curr_tokens = tokenizer->tokenize (curr_text, nullptr );
17301734 tokens.insert (tokens.end (), curr_tokens.begin (), curr_tokens.end ());
@@ -1751,14 +1755,13 @@ struct LLMEmbedder : public Conditioner {
17511755 LOG_INFO (" LongCatEditPipeline" );
17521756 prompt_template_encode_start_idx = 67 ;
17531757 // prompt_template_encode_end_idx = 5;
1754- int image_embed_idx = 36 + 6 ;
1758+ int image_embed_idx = 36 + 6 ;
17551759
17561760 int min_pixels = 384 * 384 ;
17571761 int max_pixels = 560 * 560 ;
17581762 std::string placeholder = " <|image_pad|>" ;
17591763 std::string img_prompt;
17601764
1761-
17621765 // Only one image is officially supported by the model, not sure how it handles multiple images
17631766 for (int i = 0 ; i < conditioner_params.ref_images .size (); i++) {
17641767 sd_image_f32_t image = sd_image_t_to_sd_image_f32_t (*conditioner_params.ref_images [i]);
0 commit comments