Skip to content

Commit 9c7ba31

Browse files
author
LittleMouse
committed
[update] update llm-asr & model config
1 parent 3ab3d87 commit 9c7ba31

File tree

2 files changed

+130
-64
lines changed

2 files changed

+130
-64
lines changed

projects/llm_framework/main_asr/mode_sense-voice-small-10s-ax650.json

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020
],
2121
"mode_param": {
2222
"model_config.sense_voice.model": "model.axmodel",
23+
"model_config.sense_voice.use_itn": true,
2324
"model_config.tokens": "tokens.txt",
2425
"silero_vad.model": "silero_vad.ort",
2526
"model_config.provider": "axera",

projects/llm_framework/main_asr/src/main.cpp

Lines changed: 129 additions & 64 deletions
Original file line number | Diff line number | Diff line change
@@ -120,6 +120,14 @@ class llm_task {
120120
buffer_t *pcmdata;
121121
std::function<void(void)> pause;
122122

123+
/// Reports whether the `use_itn` option of the SenseVoice model config is enabled.
///
/// @return `onnx_asr_config_.model_config.sense_voice.use_itn` when
///         `engine_type_` is `ENGINE_ONNX`; `false` for every other engine type.
bool is_using_itn() const
124+
{
125+
if (engine_type_ == ENGINE_ONNX) {
126+
return onnx_asr_config_.model_config.sense_voice.use_itn;
127+
}
128+
// Any engine other than ENGINE_ONNX is reported as not using ITN.
return false;
129+
}
130+
123131
bool parse_config(const nlohmann::json &config_body)
124132
{
125133
try {
@@ -546,91 +554,146 @@ class llm_task {
546554

547555
void sys_pcm_on_data_onnx(const std::string &raw)
548556
{
549-
if (raw.size() >= sizeof(int16_t)) {
550-
const int16_t *pcm16 = reinterpret_cast<const int16_t *>(raw.data());
551-
size_t n16 = raw.size() / sizeof(int16_t);
552-
PushPreRollPcm(pcm16, n16);
553-
}
554-
555-
static int count = 0;
556-
if (count < delay_audio_frame_) {
557+
if (delay_audio_frame_ == 0) {
557558
buffer_write_char(pcmdata, raw.data(), raw.length());
558-
count++;
559-
return;
560-
}
559+
buffer_position_set(pcmdata, 0);
561560

562-
buffer_write_char(pcmdata, raw.data(), raw.length());
563-
buffer_position_set(pcmdata, 0);
561+
std::vector<float> floatSamples;
562+
int16_t audio_val;
563+
while (buffer_read_i16(pcmdata, &audio_val, 1)) {
564+
float normalizedSample = static_cast<float>(audio_val) / INT16_MAX;
565+
floatSamples.push_back(normalizedSample);
566+
}
564567

565-
std::vector<float> floatSamples;
566-
floatSamples.reserve((delay_audio_frame_ + 1) * kFrameSamples);
568+
buffer_resize(pcmdata, 0);
569+
int32_t window_size = vad_config_.silero_vad.window_size;
570+
int32_t i = 0;
571+
std::string final_text;
567572

568-
int16_t audio_val;
569-
while (buffer_read_i16(pcmdata, &audio_val, 1)) {
570-
floatSamples.push_back(static_cast<float>(audio_val) / 32768.0f);
571-
}
573+
while (i < floatSamples.size()) {
574+
if (i + window_size <= floatSamples.size()) {
575+
vad_->AcceptWaveform(floatSamples.data() + i, window_size);
576+
} else {
577+
vad_->Flush();
578+
}
579+
i += window_size;
572580

573-
buffer_resize(pcmdata, 0);
574-
count = 0;
581+
while (!vad_->Empty()) {
582+
const auto &segment = vad_->Front();
583+
float duration = segment.samples.size() / 16000.f;
584+
float start_time = segment.start / 16000.f;
585+
float end_time = start_time + duration;
575586

576-
vad_->AcceptWaveform(floatSamples.data(), floatSamples.size());
587+
if (duration < 0.1f) {
588+
vad_->Pop();
589+
continue;
590+
}
577591

578-
bool detected = vad_->IsSpeechDetected();
579-
bool speech_start = (!prev_vad_detected_ && detected);
580-
prev_vad_detected_ = detected;
592+
if (!offline_stream_) offline_stream_ = onnx_recognizer_->CreateStream();
581593

582-
while (!vad_->Empty()) {
583-
const auto &segment = vad_->Front();
594+
offline_stream_->AcceptWaveform(onnx_asr_config_.feat_config.sampling_rate, segment.samples.data(),
595+
segment.samples.size());
584596

585-
if (!offline_stream_) {
586-
offline_stream_ = onnx_recognizer_->CreateStream();
587-
}
597+
onnx_recognizer_->DecodeStream(offline_stream_.get());
598+
const auto &result = offline_stream_->GetResult();
588599

589-
if (speech_start && !pre_roll_pcm_.empty()) {
590-
std::vector<float> pre;
591-
pre.reserve(pre_roll_pcm_.size());
592-
for (int16_t s : pre_roll_pcm_) {
593-
pre.push_back(static_cast<float>(s) / 32768.0f);
594-
}
600+
final_text += result.text;
595601

596-
std::vector<float> merged;
597-
merged.reserve(pre.size() + segment.samples.size());
598-
merged.insert(merged.end(), pre.begin(), pre.end());
599-
merged.insert(merged.end(), segment.samples.begin(), segment.samples.end());
602+
vad_->Pop();
603+
offline_stream_.reset();
604+
}
605+
}
600606

601-
offline_stream_->AcceptWaveform(kSampleRate, merged.data(), merged.size());
607+
if (out_callback_) {
608+
out_callback_(final_text, true);
609+
}
610+
} else {
611+
if (raw.size() >= sizeof(int16_t)) {
612+
const int16_t *pcm16 = reinterpret_cast<const int16_t *>(raw.data());
613+
size_t n16 = raw.size() / sizeof(int16_t);
614+
PushPreRollPcm(pcm16, n16);
615+
}
602616

603-
pre_roll_pcm_.clear();
604-
speech_start = false;
605-
} else {
606-
offline_stream_->AcceptWaveform(kSampleRate, segment.samples.data(), segment.samples.size());
617+
static int count = 0;
618+
if (count < delay_audio_frame_) {
619+
buffer_write_char(pcmdata, raw.data(), raw.length());
620+
count++;
621+
return;
607622
}
608623

609-
onnx_recognizer_->DecodeStream(offline_stream_.get());
624+
buffer_write_char(pcmdata, raw.data(), raw.length());
625+
buffer_position_set(pcmdata, 0);
626+
627+
std::vector<float> floatSamples;
628+
floatSamples.reserve((delay_audio_frame_ + 1) * kFrameSamples);
610629

611-
const auto &result = offline_stream_->GetResult();
612-
if (!result.text.empty() && out_callback_) {
613-
out_callback_(result.text, true);
630+
int16_t audio_val;
631+
while (buffer_read_i16(pcmdata, &audio_val, 1)) {
632+
floatSamples.push_back(static_cast<float>(audio_val) / 32768.0f);
614633
}
615634

616-
vad_->Pop();
635+
buffer_resize(pcmdata, 0);
636+
count = 0;
617637

618-
offline_stream_.reset();
619-
}
638+
vad_->AcceptWaveform(floatSamples.data(), floatSamples.size());
620639

621-
{
622-
float chunk_ms = (delay_audio_frame_ + 1) * 10.0f;
623-
if (detected) {
624-
silence_ms_accum_ = 0.0f;
625-
} else {
626-
silence_ms_accum_ += chunk_ms;
640+
bool detected = vad_->IsSpeechDetected();
641+
bool speech_start = (!prev_vad_detected_ && detected);
642+
prev_vad_detected_ = detected;
643+
644+
while (!vad_->Empty()) {
645+
const auto &segment = vad_->Front();
646+
647+
if (!offline_stream_) {
648+
offline_stream_ = onnx_recognizer_->CreateStream();
649+
}
650+
651+
if (speech_start && !pre_roll_pcm_.empty()) {
652+
std::vector<float> pre;
653+
pre.reserve(pre_roll_pcm_.size());
654+
for (int16_t s : pre_roll_pcm_) {
655+
pre.push_back(static_cast<float>(s) / 32768.0f);
656+
}
657+
658+
std::vector<float> merged;
659+
merged.reserve(pre.size() + segment.samples.size());
660+
merged.insert(merged.end(), pre.begin(), pre.end());
661+
merged.insert(merged.end(), segment.samples.begin(), segment.samples.end());
662+
663+
offline_stream_->AcceptWaveform(kSampleRate, merged.data(), merged.size());
664+
665+
pre_roll_pcm_.clear();
666+
speech_start = false;
667+
} else {
668+
offline_stream_->AcceptWaveform(kSampleRate, segment.samples.data(), segment.samples.size());
669+
}
670+
671+
onnx_recognizer_->DecodeStream(offline_stream_.get());
672+
673+
const auto &result = offline_stream_->GetResult();
674+
if (!result.text.empty() && out_callback_) {
675+
out_callback_(result.text, true);
676+
}
677+
678+
vad_->Pop();
679+
680+
offline_stream_.reset();
627681
}
628682

629-
if (silence_ms_accum_ >= silence_timeout) {
630-
if (ensleep_) {
631-
if (pause) pause();
683+
{
684+
float chunk_ms = (delay_audio_frame_ + 1) * 10.0f;
685+
if (detected) {
686+
silence_ms_accum_ = 0.0f;
687+
} else {
688+
silence_ms_accum_ += chunk_ms;
689+
}
690+
691+
if (silence_ms_accum_ >= silence_timeout) {
692+
if (ensleep_) {
693+
if (pause) pause();
694+
}
695+
silence_ms_accum_ = 0.0f;
632696
}
633-
silence_ms_accum_ = 0.0f;
634697
}
635698
}
636699
}
@@ -755,8 +818,10 @@ class llm_asr : public StackFlow {
755818
std::string tmp_msg1;
756819
const std::string *next_data = &data;
757820
if (finish) {
758-
tmp_msg1 = data + ".";
759-
next_data = &tmp_msg1;
821+
if (!llm_task_obj->is_using_itn()) {
822+
tmp_msg1 = data + ".";
823+
next_data = &tmp_msg1;
824+
}
760825
}
761826

762827
if (llm_channel->enstream_) {

0 commit comments

Comments
 (0)