@@ -252,7 +252,6 @@ class llm_task {
252252 }
253253 return false ;
254254 }
255- SLOGI (" Processing text: %s" , msg_str.c_str ());
256255
257256 // Convert text to phonemes and tones
258257 std::vector<int > phones_bef, tones_bef;
@@ -262,8 +261,6 @@ class llm_task {
262261 int phone_len = phones.size ();
263262 std::vector<int > langids (phone_len, 3 );
264263
265- SLOGI (" Phoneme conversion completed, length: %d" , phone_len);
266-
267264 // Run the encoder to generate hidden representations
268265 auto encoder_output =
269266 encoder_->Run (phones, tones, langids, g_matrix, mode_config_.noise_scale , mode_config_.noise_scale_w ,
@@ -273,27 +270,19 @@ class llm_task {
273270 auto zp_info = encoder_output.at (0 ).GetTensorTypeAndShapeInfo ();
274271 auto zp_shape = zp_info.GetShape ();
275272
276- SLOGI (" Encoder output completed, shape: [%ld, %ld, %ld], expected audio length: %d" , zp_shape[0 ],
277- zp_shape[1 ], zp_shape[2 ], audio_len);
278-
279273 // Calculate decoder parameters
280274 int zp_size = decoder_->GetInputSize (0 ) / sizeof (float );
281275 int dec_len = zp_size / zp_shape[1 ];
282276 int audio_slice_len = decoder_->GetOutputSize (0 ) / sizeof (float );
283277
284- const int pad_frames = 16 ;
278+ const int pad_frames = 24 ;
285279 const int samples_per_frame = 512 ;
286280
287- SLOGI (" Decoder configuration: frame length=%d, audio slice length=%d, pad length=%d, samples per frame=%d" ,
288- dec_len, audio_slice_len, pad_frames, samples_per_frame);
289-
290281 const int effective_frames = dec_len - 2 * pad_frames;
291282
292283 int dec_slice_num =
293284 static_cast <int >(std::ceil (static_cast <double >(zp_shape[2 ]) / static_cast <double >(effective_frames)));
294285
295- SLOGI (" Will perform %d inferences, each with effective frames: %d" , dec_slice_num, effective_frames);
296-
297286 // SOLA parameters setup
298287 const int sola_buffer_frame = pad_frames * samples_per_frame; // Overlap buffer length
299288 const int sola_search_frame = pad_frames * samples_per_frame; // Search window length
@@ -344,10 +333,6 @@ class llm_task {
344333 output_start_frame = i * effective_frames;
345334 output_end_frame = (i + 1 ) * effective_frames - 1 ;
346335 }
347-
348- SLOGI (" Inference #%d: input frame range=[%d-%d], actual length=%d, output frame range=[%d-%d]" , i + 1 ,
349- input_start, input_start + actual_len - 1 , actual_len, output_start_frame, output_end_frame);
350-
351336 // Prepare decoder input, initialize all to zero
352337 std::vector<float > zp (zp_size, 0 );
353338
@@ -365,8 +350,6 @@ class llm_task {
365350 decoder_->SetInput (zp.data (), 0 );
366351 decoder_->SetInput (g_matrix.data (), 1 );
367352
368- SLOGI (" Inference #%d: starting decoding..." , i + 1 );
369-
370353 if (0 != decoder_->Run ()) {
371354 SLOGI (" Inference #%d: decoding failed" , i + 1 );
372355 throw std::string (" decoder_ RunSync error" );
@@ -416,10 +399,6 @@ class llm_task {
416399
417400 first_frame = false ;
418401
419- SLOGI (
420- " Inference #%d: First frame processing, added %d samples from position %d to output, saved %d "
421- " samples to SOLA buffer" ,
422- i + 1 , audio_len, audio_start, sola_buffer_frame);
423402 } else {
424403 // Non-first frame: SOLA alignment required
425404 int audio_start = pad_frames * samples_per_frame;
@@ -451,9 +430,6 @@ class llm_task {
451430 }
452431 }
453432
454- SLOGI (" Inference #%d: SOLA found best alignment offset %d with correlation coefficient %f" , i + 1 ,
455- best_offset, best_correlation);
456-
457433 // 3. Apply alignment offset
458434 int aligned_start = audio_start + best_offset;
459435
@@ -482,9 +458,6 @@ class llm_task {
482458 int remaining_len =
483459 std::min (remaining_needed, static_cast <int >(decoder_output.size () - remaining_start));
484460
485- SLOGI (" Inference #%d (final): Expected total=%d, processed=%d, needed=%d, available=%d" , i + 1 ,
486- total_expected_samples, processed_samples, remaining_needed, remaining_len);
487-
488461 if (remaining_len > 0 ) {
489462 pcmlist.insert (pcmlist.end (), decoder_output.begin () + remaining_start,
490463 decoder_output.begin () + remaining_start + remaining_len);
@@ -514,50 +487,34 @@ class llm_task {
514487 }
515488 std::fill (sola_buffer.begin () + avail, sola_buffer.end (), 0 .0f );
516489 }
517-
518- SLOGI (" Inference #%d: Added %d + %d samples to output, cumulative length: %zu" , i + 1 ,
519- sola_buffer_frame, remaining_len, pcmlist.size ());
520490 }
521491 }
522492 }
523493
524- SLOGI (" All inference completed, raw generated PCM length: %zu" , pcmlist.size ());
525-
526494 if (pcmlist.size () > audio_len) {
527- SLOGI (" Truncating output from %zu to %d samples as per encoder prediction" , pcmlist.size (), audio_len);
528495 pcmlist.resize (audio_len);
529496 }
530497
531- SLOGI (" Final PCM length after truncation: %zu" , pcmlist.size ());
532-
533498 // Post-processing: resample and convert to int16
534499 double src_ratio =
535500 static_cast <double >(mode_config_.audio_rate ) / static_cast <double >(mode_config_.mode_rate );
536501 std::vector<float > tmp_pcm ((pcmlist.size () * src_ratio + 1 ));
537502 int len;
538503
539- SLOGI (" Starting audio resampling, source rate: %f, target rate: %f, ratio: %f" ,
540- static_cast <float >(mode_config_.mode_rate ), static_cast <float >(mode_config_.audio_rate ), src_ratio);
541-
542504 resample_audio (pcmlist.data (), pcmlist.size (), tmp_pcm.data (), &len, src_ratio);
543505
544- SLOGI (" Resampling completed, length after resampling: %d" , len);
545-
546506 // Convert to 16-bit PCM
547507 wav_pcm_data.reserve (len);
548508 std::transform (tmp_pcm.begin (), tmp_pcm.begin () + len, std::back_inserter (wav_pcm_data),
549509 [](const auto val) { return static_cast <int16_t >(val * INT16_MAX); });
550510
551- SLOGI (" Final audio length: %zu samples" , wav_pcm_data.size ());
552-
553511 // Call the output callback function with the result
554512 if (out_callback_) {
555513 out_callback_ (
556514 std::string (reinterpret_cast <char *>(wav_pcm_data.data ()), wav_pcm_data.size () * sizeof (int16_t )),
557515 finish);
558516 }
559517
560- SLOGI (" TTS processing completed, output callback invoked" );
561518 } catch (const std::exception &e) {
562519 SLOGI (" TTS processing exception: %s" , e.what ());
563520 return true ;
0 commit comments