Skip to content

Commit fd032bc

Browse files
committed
Split quotes by utf8 characters rather than individual char
1 parent c044a40 commit fd032bc

File tree

1 file changed

+45
-42
lines changed

1 file changed

+45
-42
lines changed

conditioner.hpp

Lines changed: 45 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1648,6 +1648,19 @@ struct LLMEmbedder : public Conditioner {
16481648
}
16491649
}
16501650

1651+
// Determine how many bytes the UTF-8 sequence beginning with `c` occupies.
// `c` is the lead byte of the sequence; continuation or malformed lead
// bytes yield 1 so a caller scanning a string always makes progress.
// Returns a length in the range [1, 4].
size_t get_utf8_char_len(char c) {
    const unsigned char lead = static_cast<unsigned char>(c);
    if (lead <= 0x7F) {
        return 1;  // single-byte ASCII
    }
    if (lead >= 0xC0 && lead <= 0xDF) {
        return 2;  // two-byte sequence
    }
    if (lead >= 0xE0 && lead <= 0xEF) {
        return 3;  // three-byte sequence (common for CJK text)
    }
    if (lead >= 0xF0 && lead <= 0xF7) {
        return 4;  // four-byte sequence (emoji, supplementary planes)
    }
    // Continuation byte (0x80-0xBF) or invalid lead (>= 0xF8): treat as a
    // lone byte so scanning resynchronizes instead of stalling.
    return 1;
}
1663+
16511664
std::tuple<std::vector<int>, std::vector<float>> tokenize(
16521665
std::string text,
16531666
std::pair<int, int> attn_range,
@@ -1667,16 +1680,6 @@ struct LLMEmbedder : public Conditioner {
16671680
}
16681681
parsed_attention.emplace_back(text.substr(attn_range.second), 1.f);
16691682

1670-
// {
1671-
// std::stringstream ss;
1672-
// ss << '[';
1673-
// for (const auto& item : parsed_attention) {
1674-
// ss << "['" << item.first << "', " << item.second << "], ";
1675-
// }
1676-
// ss << ']';
1677-
// LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
1678-
// }
1679-
16801683
std::vector<int> tokens;
16811684
std::vector<float> weights;
16821685

@@ -1685,46 +1688,47 @@ struct LLMEmbedder : public Conditioner {
16851688
float curr_weight = item.second;
16861689

16871690
if (spell_quotes) {
1688-
std::vector<std::string> parts;
1691+
std::string buffer;
16891692
bool in_quote = false;
1690-
std::string current_part;
16911693

1692-
for (char c : curr_text) {
1693-
if (c == '"') {
1694-
if (!current_part.empty()) {
1695-
parts.push_back(current_part);
1696-
current_part.clear();
1694+
size_t i = 0;
1695+
while (i < curr_text.size()) {
1696+
// utf8 character can be 1-4 char
1697+
size_t char_len = get_utf8_char_len(curr_text[i]);
1698+
1699+
// Safety check to prevent reading past end of string
1700+
if (i + char_len > curr_text.size()) {
1701+
char_len = curr_text.size() - i;
1702+
}
1703+
std::string uchar = curr_text.substr(i, char_len);
1704+
i += char_len;
1705+
1706+
if (uchar == "\"") {
1707+
buffer += uchar;
1708+
// If we were accumulating normal text, flush it now
1709+
if (!in_quote) {
1710+
std::vector<int> part_tokens = tokenizer->tokenize(buffer, nullptr);
1711+
tokens.insert(tokens.end(), part_tokens.begin(), part_tokens.end());
1712+
weights.insert(weights.end(), part_tokens.size(), curr_weight);
1713+
buffer.clear();
16971714
}
16981715
in_quote = !in_quote;
16991716
} else {
1700-
current_part += c;
1701-
if (in_quote && current_part.size() == 1) {
1702-
parts.push_back(current_part);
1703-
current_part.clear();
1704-
}
1705-
}
1706-
}
1707-
if (!current_part.empty()) {
1708-
parts.push_back(current_part);
1709-
}
1710-
1711-
for (const auto& part : parts) {
1712-
if (part.empty())
1713-
continue;
1714-
if (part[0] == '"' && part.back() == '"') {
1715-
std::string quoted_content = part.substr(1, part.size() - 2);
1716-
for (char ch : quoted_content) {
1717-
std::string char_str(1, ch);
1718-
std::vector<int> char_tokens = tokenizer->tokenize(char_str, nullptr);
1717+
if (in_quote) {
1718+
std::vector<int> char_tokens = tokenizer->tokenize(uchar, nullptr);
17191719
tokens.insert(tokens.end(), char_tokens.begin(), char_tokens.end());
17201720
weights.insert(weights.end(), char_tokens.size(), curr_weight);
1721+
} else {
1722+
buffer += uchar;
17211723
}
1722-
} else {
1723-
std::vector<int> part_tokens = tokenizer->tokenize(part, nullptr);
1724-
tokens.insert(tokens.end(), part_tokens.begin(), part_tokens.end());
1725-
weights.insert(weights.end(), part_tokens.size(), curr_weight);
17261724
}
17271725
}
1726+
1727+
if (!buffer.empty()) {
1728+
std::vector<int> part_tokens = tokenizer->tokenize(buffer, nullptr);
1729+
tokens.insert(tokens.end(), part_tokens.begin(), part_tokens.end());
1730+
weights.insert(weights.end(), part_tokens.size(), curr_weight);
1731+
}
17281732
} else {
17291733
std::vector<int> curr_tokens = tokenizer->tokenize(curr_text, nullptr);
17301734
tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
@@ -1751,14 +1755,13 @@ struct LLMEmbedder : public Conditioner {
17511755
LOG_INFO("LongCatEditPipeline");
17521756
prompt_template_encode_start_idx = 67;
17531757
// prompt_template_encode_end_idx = 5;
1754-
int image_embed_idx = 36 + 6;
1758+
int image_embed_idx = 36 + 6;
17551759

17561760
int min_pixels = 384 * 384;
17571761
int max_pixels = 560 * 560;
17581762
std::string placeholder = "<|image_pad|>";
17591763
std::string img_prompt;
17601764

1761-
17621765
// Only one image is officially supported by the model; behavior with multiple images is untested
17631766
for (int i = 0; i < conditioner_params.ref_images.size(); i++) {
17641767
sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(*conditioner_params.ref_images[i]);

0 commit comments

Comments
 (0)