@@ -120,6 +120,14 @@ class llm_task {
120120 buffer_t *pcmdata;
121121 std::function<void(void)> pause;
122122
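// Reports whether the ONNX SenseVoice engine is configured with inverse text normalization (use_itn).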
123+ bool is_using_itn() const
124+ {
125+ if (engine_type_ == ENGINE_ONNX) {
126+ return onnx_asr_config_.model_config.sense_voice.use_itn;
127+ }
128+ return false;
129+ }
130+
123131 bool parse_config(const nlohmann::json &config_body)
124132 {
125133 try {
@@ -546,91 +554,146 @@ class llm_task {
546554
547555 void sys_pcm_on_data_onnx(const std::string &raw)
548556 {
549- if (raw.size() >= sizeof(int16_t)) {
550- const int16_t *pcm16 = reinterpret_cast<const int16_t *>(raw.data());
551- size_t n16 = raw.size() / sizeof(int16_t);
552- PushPreRollPcm(pcm16, n16);
553- }
554-
555- static int count = 0;
556- if (count < delay_audio_frame_) {
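// New branch: with delay_audio_frame_ == 0 the incoming chunk is processed immediately: it is converted to
// float samples, windowed through silero VAD, decoded segment by segment, and emitted in a single callback.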
557+ if (delay_audio_frame_ == 0) {
557558 buffer_write_char(pcmdata, raw.data(), raw.length());
558- count++;
559- return;
560- }
559+ buffer_position_set(pcmdata, 0);
561560
562- buffer_write_char(pcmdata, raw.data(), raw.length());
563- buffer_position_set(pcmdata, 0);
561+ std::vector<float> floatSamples;
562+ int16_t audio_val;
563+ while (buffer_read_i16(pcmdata, &audio_val, 1)) {
564+ float normalizedSample = static_cast<float>(audio_val) / INT16_MAX;
565+ floatSamples.push_back(normalizedSample);
566+ }
564567
565- std::vector<float> floatSamples;
566- floatSamples.reserve((delay_audio_frame_ + 1) * kFrameSamples);
568+ buffer_resize(pcmdata, 0);
569+ int32_t window_size = vad_config_.silero_vad.window_size;
570+ int32_t i = 0;
571+ std::string final_text;
567572
568- int16_t audio_val;
569- while (buffer_read_i16(pcmdata, &audio_val, 1)) {
570- floatSamples.push_back(static_cast<float>(audio_val) / 32768.0f);
571- }
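// Feed the float samples to the VAD in window_size slices; Flush() closes any segment still open at the end of the chunk.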
573+ while (i < floatSamples.size()) {
574+ if (i + window_size <= floatSamples.size()) {
575+ vad_->AcceptWaveform(floatSamples.data() + i, window_size);
576+ } else {
577+ vad_->Flush();
578+ }
579+ i += window_size;
572580
573- buffer_resize(pcmdata, 0);
574- count = 0;
581+ while (!vad_->Empty()) {
582+ const auto &segment = vad_->Front();
583+ float duration = segment.samples.size() / 16000.f;
584+ float start_time = segment.start / 16000.f;
585+ float end_time = start_time + duration;
575586
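// Segments shorter than 100 ms are dropped; each remaining segment is decoded with a one-shot offline stream and its text appended to final_text.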
576- vad_->AcceptWaveform(floatSamples.data(), floatSamples.size());
587+ if (duration < 0.1f) {
588+ vad_->Pop();
589+ continue;
590+ }
577591
578- bool detected = vad_->IsSpeechDetected();
579- bool speech_start = (!prev_vad_detected_ && detected);
580- prev_vad_detected_ = detected;
592+ if (!offline_stream_) offline_stream_ = onnx_recognizer_->CreateStream();
581593
582- while (!vad_->Empty()) {
583- const auto &segment = vad_->Front();
594+ offline_stream_->AcceptWaveform(onnx_asr_config_.feat_config.sampling_rate, segment.samples.data(),
595+ segment.samples.size());
584596
585- if (!offline_stream_) {
586- offline_stream_ = onnx_recognizer_->CreateStream();
587- }
597+ onnx_recognizer_->DecodeStream(offline_stream_.get());
598+ const auto &result = offline_stream_->GetResult();
588599
589- if (speech_start && !pre_roll_pcm_.empty()) {
590- std::vector<float> pre;
591- pre.reserve(pre_roll_pcm_.size());
592- for (int16_t s : pre_roll_pcm_) {
593- pre.push_back(static_cast<float>(s) / 32768.0f);
594- }
600+ final_text += result.text;
595601
596- std::vector<float> merged;
597- merged.reserve(pre.size() + segment.samples.size());
598- merged.insert(merged.end(), pre.begin(), pre.end());
599- merged.insert(merged.end(), segment.samples.begin(), segment.samples.end());
602+ vad_->Pop();
603+ offline_stream_.reset();
604+ }
605+ }
600606
601- offline_stream_->AcceptWaveform(kSampleRate, merged.data(), merged.size());
607+ if (out_callback_) {
608+ out_callback_(final_text, true);
609+ }
610+ } else {
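// Original buffered path (delay_audio_frame_ > 0): cache pre-roll PCM, accumulate delay_audio_frame_ chunks, then run VAD and ASR as before.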
611+ if (raw.size() >= sizeof(int16_t)) {
612+ const int16_t *pcm16 = reinterpret_cast<const int16_t *>(raw.data());
613+ size_t n16 = raw.size() / sizeof(int16_t);
614+ PushPreRollPcm(pcm16, n16);
615+ }
602616
603- pre_roll_pcm_.clear();
604- speech_start = false;
605- } else {
606- offline_stream_->AcceptWaveform(kSampleRate, segment.samples.data(), segment.samples.size());
617+ static int count = 0;
618+ if (count < delay_audio_frame_) {
619+ buffer_write_char(pcmdata, raw.data(), raw.length());
620+ count++;
621+ return;
607622 }
608623
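// Enough frames buffered: rewind pcmdata and convert the accumulated int16 PCM to normalized floats for the VAD.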
609- onnx_recognizer_->DecodeStream(offline_stream_.get());
624+ buffer_write_char(pcmdata, raw.data(), raw.length());
625+ buffer_position_set(pcmdata, 0);
626+
627+ std::vector<float> floatSamples;
628+ floatSamples.reserve((delay_audio_frame_ + 1) * kFrameSamples);
610629
611- const auto &result = offline_stream_->GetResult();
612- if (!result.text.empty() && out_callback_) {
613- out_callback_(result.text, true);
630+ int16_t audio_val;
631+ while (buffer_read_i16(pcmdata, &audio_val, 1)) {
632+ floatSamples.push_back(static_cast<float>(audio_val) / 32768.0f);
614633 }
615634
616- vad_->Pop();
635+ buffer_resize(pcmdata, 0);
636+ count = 0;
617637
618- offline_stream_.reset();
619- }
638+ vad_->AcceptWaveform(floatSamples.data(), floatSamples.size());
620639
621- {
622- float chunk_ms = (delay_audio_frame_ + 1) * 10.0f;
623- if (detected) {
624- silence_ms_accum_ = 0.0f;
625- } else {
626- silence_ms_accum_ += chunk_ms;
640+ bool detected = vad_->IsSpeechDetected();
641+ bool speech_start = (!prev_vad_detected_ && detected);
642+ prev_vad_detected_ = detected;
643+
644+ while (!vad_->Empty()) {
645+ const auto &segment = vad_->Front();
646+
647+ if (!offline_stream_) {
648+ offline_stream_ = onnx_recognizer_->CreateStream();
649+ }
650+
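// At a fresh speech onset, prepend the cached pre-roll samples so the first words of the utterance are not clipped.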
651+ if (speech_start && !pre_roll_pcm_.empty()) {
652+ std::vector<float> pre;
653+ pre.reserve(pre_roll_pcm_.size());
654+ for (int16_t s : pre_roll_pcm_) {
655+ pre.push_back(static_cast<float>(s) / 32768.0f);
656+ }
657+
658+ std::vector<float> merged;
659+ merged.reserve(pre.size() + segment.samples.size());
660+ merged.insert(merged.end(), pre.begin(), pre.end());
661+ merged.insert(merged.end(), segment.samples.begin(), segment.samples.end());
662+
663+ offline_stream_->AcceptWaveform(kSampleRate, merged.data(), merged.size());
664+
665+ pre_roll_pcm_.clear();
666+ speech_start = false;
667+ } else {
668+ offline_stream_->AcceptWaveform(kSampleRate, segment.samples.data(), segment.samples.size());
669+ }
670+
671+ onnx_recognizer_->DecodeStream(offline_stream_.get());
672+
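// Forward each segment's non-empty result to the caller as soon as it is decoded.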
673+ const auto &result = offline_stream_->GetResult();
674+ if (!result.text.empty() && out_callback_) {
675+ out_callback_(result.text, true);
676+ }
677+
678+ vad_->Pop();
679+
680+ offline_stream_.reset();
627681 }
628682
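// Silence bookkeeping: accumulate quiet time per chunk and, once silence_timeout is reached, pause the pipeline when ensleep_ is set.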
629- if (silence_ms_accum_ >= silence_timeout) {
630- if (ensleep_) {
631- if (pause) pause();
683+ {
684+ float chunk_ms = (delay_audio_frame_ + 1) * 10.0f;
685+ if (detected) {
686+ silence_ms_accum_ = 0.0f;
687+ } else {
688+ silence_ms_accum_ += chunk_ms;
689+ }
690+
691+ if (silence_ms_accum_ >= silence_timeout) {
692+ if (ensleep_) {
693+ if (pause) pause();
694+ }
695+ silence_ms_accum_ = 0.0f;
632696 }
633- silence_ms_accum_ = 0.0f;
634697 }
635698 }
636699 }
@@ -755,8 +818,10 @@ class llm_asr : public StackFlow {
755818 std::string tmp_msg1;
756819 const std::string *next_data = &data;
757820 if (finish) {
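// When ITN is enabled the recognizer output is forwarded as-is; otherwise a trailing " ." is appended to the final result.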
758- tmp_msg1 = data + " .";
759- next_data = &tmp_msg1;
821+ if (!llm_task_obj->is_using_itn()) {
822+ tmp_msg1 = data + " .";
823+ next_data = &tmp_msg1;
824+ }
760825 }
761826
762827 if (llm_channel->enstream_) {