Skip to content

Commit cbb0afa

Browse files
committed
Optimize G2P process to skip inference for short audio clips
Improved the Grapheme-to-Phoneme (G2P) process by eliminating separate inference operations for audio segments that are too short, enhancing processing efficiency.
1 parent b1df925 commit cbb0afa

File tree

1 file changed

+121
-9
lines changed
  • projects/llm_framework/main_melotts/src/runner

1 file changed

+121
-9
lines changed

projects/llm_framework/main_melotts/src/runner/Lexicon.hpp

Lines changed: 121 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -195,16 +195,99 @@ class Lexicon {
195195
phonetic_str.c_str());
196196
}
197197

198-
void convert(const std::string& text, std::vector<int>& phones, std::vector<int>& tones)
198+
std::vector<std::string> splitTextByPunctuation(const std::string& text)
199+
{
200+
std::vector<std::string> segments;
201+
auto chars = splitEachChar(text);
202+
std::string current_segment;
203+
204+
for (size_t i = 0; i < chars.size(); ++i) {
205+
std::string c = chars[i];
206+
current_segment += c;
207+
208+
bool is_segment_punct = false;
209+
std::string punct_key = c;
210+
211+
if (c == "")
212+
punct_key = ",";
213+
else if (c == "")
214+
punct_key = ".";
215+
else if (c == "")
216+
punct_key = "!";
217+
else if (c == "")
218+
punct_key = "?";
219+
220+
if (lexicon.find(punct_key) != lexicon.end() &&
221+
(punct_key == "." || punct_key == "!" || punct_key == "?" || punct_key == "," || punct_key == "")) {
222+
is_segment_punct = true;
223+
}
224+
225+
if (is_segment_punct && i < chars.size() - 1) {
226+
segments.push_back(current_segment);
227+
current_segment.clear();
228+
}
229+
}
230+
231+
if (!current_segment.empty()) {
232+
segments.push_back(current_segment);
233+
}
234+
235+
return segments;
236+
}
237+
std::vector<std::string> mergeShortSegments(const std::vector<std::string>& segments, int min_length = 4)
238+
{
239+
std::vector<std::string> merged_segments;
240+
std::string current_segment;
241+
242+
for (size_t i = 0; i < segments.size(); ++i) {
243+
auto chars = splitEachChar(segments[i]);
244+
int actual_chars = 0;
245+
for (const auto& c : chars) {
246+
if (c != " " && lexicon.find(c) != lexicon.end()) {
247+
std::string punct_key = c;
248+
if (c == "")
249+
punct_key = ",";
250+
else if (c == "")
251+
punct_key = ".";
252+
else if (c == "")
253+
punct_key = "!";
254+
else if (c == "")
255+
punct_key = "?";
256+
257+
if (punct_key != "," && punct_key != "." && punct_key != "!" && punct_key != "?" &&
258+
punct_key != "" && punct_key != "'" && punct_key != "-") {
259+
actual_chars++;
260+
}
261+
} else if (is_english(c)) {
262+
actual_chars++;
263+
}
264+
}
265+
if (actual_chars < min_length && i < segments.size() - 1) {
266+
if (current_segment.empty()) {
267+
current_segment = segments[i];
268+
} else {
269+
current_segment += segments[i];
270+
}
271+
} else {
272+
if (!current_segment.empty()) {
273+
current_segment += segments[i];
274+
merged_segments.push_back(current_segment);
275+
current_segment.clear();
276+
} else {
277+
merged_segments.push_back(segments[i]);
278+
}
279+
}
280+
}
281+
282+
if (!current_segment.empty()) {
283+
merged_segments.push_back(current_segment);
284+
}
285+
286+
return merged_segments;
287+
}
288+
289+
void processSegment(const std::string& text, std::vector<int>& phones, std::vector<int>& tones)
199290
{
200-
DEBUG_LOG("\nStarting text processing: \"%s\"", text.c_str());
201-
DEBUG_LOG("=======Matching Results=======");
202-
DEBUG_LOG("Unit\t|\tPhonemes\t|\tTones");
203-
DEBUG_LOG("-----------------------------");
204-
phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end());
205-
tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end());
206-
DEBUG_LOG("<BOS>\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(),
207-
tonesToString(unknown_token.second).c_str());
208291
auto chars = splitEachChar(text);
209292
int i = 0;
210293
while (i < chars.size()) {
@@ -274,10 +357,39 @@ class Lexicon {
274357
}
275358
}
276359
}
360+
}
361+
362+
void convert(const std::string& text, std::vector<int>& phones, std::vector<int>& tones)
363+
{
364+
DEBUG_LOG("\nStarting text processing: \"%s\"", text.c_str());
365+
366+
std::vector<std::string> segments = splitTextByPunctuation(text);
367+
368+
std::vector<std::string> merged_segments = mergeShortSegments(segments);
369+
370+
DEBUG_LOG("Text divided into %zu segments after merging short segments", merged_segments.size());
371+
for (size_t i = 0; i < merged_segments.size(); ++i) {
372+
DEBUG_LOG("Segment %zu: \"%s\"", i + 1, merged_segments[i].c_str());
373+
}
374+
375+
DEBUG_LOG("=======Matching Results=======");
376+
DEBUG_LOG("Unit\t|\tPhonemes\t|\tTones");
377+
DEBUG_LOG("-----------------------------");
378+
379+
phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end());
380+
tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end());
381+
DEBUG_LOG("<BOS>\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(),
382+
tonesToString(unknown_token.second).c_str());
383+
384+
for (const auto& segment : merged_segments) {
385+
processSegment(segment, phones, tones);
386+
}
387+
277388
phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end());
278389
tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end());
279390
DEBUG_LOG("<EOS>\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(),
280391
tonesToString(unknown_token.second).c_str());
392+
281393
DEBUG_LOG("\nProcessing Summary:");
282394
DEBUG_LOG("Original text: %s", text.c_str());
283395
DEBUG_LOG("Phonemes: %s", phonesToString(phones).c_str());

0 commit comments

Comments
 (0)