11#pragma once
22
3+ // Configure FFT to output 16 bit fixed point.
4+ #define FIXED_POINT 16
5+
36#include < TensorFlowLite.h>
47
58#include < cmath>
@@ -73,7 +76,7 @@ struct TfLiteConfig {
7376 // the frequency information. This has to be a power of two, and since
7477 // we're dealing with 30ms of 16KHz inputs, which means 480 samples, this
7578 // is the next value.
76- int kMaxAudioSampleSize = 480 ;
79+ // int kMaxAudioSampleSize = 320; //512; // 480
7780 int kAudioSampleFrequency = 16000 ;
7881
7982 // Number of audio channels - is usually 1. If 2 we reduce it to 1 by averaging the 2 channels
@@ -90,6 +93,8 @@ struct TfLiteConfig {
9093 int kSlicesToProcess = 3 ;
9194
9295 int featureElementCount () { return kFeatureSliceSize * kFeatureSliceCount ; }
96+ int audioSampleSize () { return kFeatureSliceDurationMs * (kAudioSampleFrequency / 1000 ); }
97+ int strideSampleSize () {return kFeatureSliceStrideMs * (kAudioSampleFrequency / 1000 );}
9398
9499 // Parameters for RecognizeCommands
95100 int32_t average_window_duration_ms = 1000 ;
@@ -190,7 +195,7 @@ class TfLiteResultsQueue {
190195template <int N>
191196class TfLiteAbstractRecognizeCommands {
192197 public:
193- virtual TfLiteStatus ProcessLatestResults (const TfLiteTensor* latest_results,
198+ virtual TfLiteStatus processLatestResults (const TfLiteTensor* latest_results,
194199 const int32_t current_time_ms,
195200 const char ** found_command, uint8_t * score,
196201 bool * is_new_command) = 0;
@@ -225,32 +230,41 @@ class TfLiteRecognizeCommands : public TfLiteAbstractRecognizeCommands<N> {
225230 // further recognitions for a set time after one has been triggered, which can
226231 // help reduce spurious recognitions.
227232
228- explicit TfLiteRecognizeCommands () {
233+ TfLiteRecognizeCommands () {
229234 previous_top_label_ = " silence" ;
230235 previous_top_label_time_ = std::numeric_limits<int32_t >::min ();
231236 kCategoryCount = N;
232237 }
233238
234239 // / Setup parameters from config
235240 bool begin (TfLiteConfig cfg) override {
241+ if (kCategoryCount ==0 ){
242+ LOGE (" kCategoryCount must not be 0" );
243+ return false ;
244+ }
245+ if (cfg.labels ==nullptr ){
246+ LOGE (" config.labels not defined" );
247+ return false ;
248+ }
236249 average_window_duration_ms_ = cfg.average_window_duration_ms ;
237250 detection_threshold_ = cfg.detection_threshold ;
238251 suppression_ms_ = cfg.suppression_ms ;
239252 minimum_count_ = cfg.minimum_count ;
240253 kCategoryLabels = cfg.labels ;
241- if (cfg.labels ==0 ){
242- LOGW (" config.labels not defined" );
243- return false ;
244- }
254+ started = true ;
245255 return true ;
246256 }
247257
248258 // Call this with the results of running a model on sample data.
249- virtual TfLiteStatus ProcessLatestResults (const TfLiteTensor* latest_results,
259+ virtual TfLiteStatus processLatestResults (const TfLiteTensor* latest_results,
250260 const int32_t current_time_ms,
251261 const char ** found_command, uint8_t * score,
252262 bool * is_new_command) override {
253263 LOGD (LOG_METHOD);
264+ if (!started){
265+ LOGE (" TfLiteRecognizeCommands not started" );
266+ return kTfLiteError ;
267+ }
254268 if ((latest_results->dims ->size != 2 ) ||
255269 (latest_results->dims ->data [0 ] != 1 ) ||
256270 (latest_results->dims ->data [1 ] != kCategoryCount )) {
@@ -359,6 +373,7 @@ class TfLiteRecognizeCommands : public TfLiteAbstractRecognizeCommands<N> {
359373 int32_t minimum_count_;
360374 int kCategoryCount ;
361375 const char ** kCategoryLabels = nullptr ;
376+ bool started = false ;
362377
363378 // Working variables
364379 TfLiteResultsQueue<N> previous_results_;
@@ -385,13 +400,20 @@ class TfLiteAudioFeatureProvider {
385400 virtual bool begin (TfLiteConfig config) {
386401 LOGD (LOG_METHOD);
387402 cfg = config;
403+ kMaxAudioSampleSize = cfg.audioSampleSize ();
404+ kStrideSampleSize = cfg.strideSampleSize ();
405+ kKeepSampleSize = kMaxAudioSampleSize - kStrideSampleSize ;
406+
407+ // Allocate ring buffer
388408 if (p_buffer == nullptr ) {
389- p_buffer = new audio_tools::RingBuffer<int16_t >(cfg. kMaxAudioSampleSize );
390- LOGD (" Allocating buffer for %d samples" , cfg. kMaxAudioSampleSize );
409+ p_buffer = new audio_tools::RingBuffer<int16_t >(kMaxAudioSampleSize );
410+ LOGD (" Allocating buffer for %d samples" , kMaxAudioSampleSize );
391411 }
412+
392413 // Initialize the feature data to default values.
393414 if (feature_data_ == nullptr ) {
394- feature_data_ = new int8_t [cfg.featureElementCount ()]{}; // initialzed array
415+ feature_data_ = new int8_t [cfg.featureElementCount ()];
416+ memset (feature_data_,0 , cfg.featureElementCount ());
395417 }
396418
397419 TfLiteStatus init_status = initializeMicroFeatures ();
@@ -428,15 +450,13 @@ class TfLiteAudioFeatureProvider {
428450
429451 protected:
430452 TfLiteConfig cfg;
431- // int feature_size_;
432453 int8_t * feature_data_ = nullptr ;
433- // Make sure we don't try to use cached information if this is the first
434- // call into the provider.
435- bool is_first_run_ = true ;
436- bool g_is_first_time = true ;
437- // const char** kCategoryLabels;
438454 audio_tools::RingBuffer<int16_t >* p_buffer = nullptr ;
439455 FrontendState g_micro_features_state;
456+ FrontendConfig config;
457+ int kMaxAudioSampleSize ;
458+ int kStrideSampleSize ;
459+ int kKeepSampleSize ;
440460
441461 // If we can avoid recalculating some slices, just move the existing
442462 // data up in the spectrogram, to perform something like this: last time
@@ -452,26 +472,32 @@ class TfLiteAudioFeatureProvider {
452472 // +-----------+ +-----------+
453473 virtual void addSlice () {
454474 LOGD (LOG_METHOD);
475+ // shift feature_data_ by one slice
455476 memmove (feature_data_, feature_data_ + cfg.kFeatureSliceSize ,
456477 (cfg.kFeatureSliceCount - 1 ) * cfg.kFeatureSliceSize );
457478
458479 // copy data from buffer to audio_samples
459- int16_t audio_samples[cfg.kMaxAudioSampleSize ];
460- int audio_samples_size =
461- p_buffer->readArray (audio_samples, cfg.kMaxAudioSampleSize );
480+ int16_t audio_samples[kMaxAudioSampleSize ];
481+ int audio_samples_size = p_buffer->readArray (audio_samples, kMaxAudioSampleSize );
482+
483+ // check size
484+ if (audio_samples_size!=kMaxAudioSampleSize ){
485+ LOGE (" audio_samples_size=%d != kMaxAudioSampleSize=%d" ,audio_samples_size, kMaxAudioSampleSize );
486+ }
462487
488+ // keep some data to be reprocessed - move by kStrideSampleSize
489+ p_buffer->writeArray (audio_samples+kStrideSampleSize , kKeepSampleSize );
463490
464491 // the new slice data will always be stored at the end
465- int8_t * new_slice_data =
466- feature_data_ + ((cfg.kFeatureSliceCount - 1 ) * cfg.kFeatureSliceSize );
467- size_t num_samples_read = audio_samples_size;
492+ int8_t * new_slice_data = feature_data_ + ((cfg.kFeatureSliceCount - 1 ) * cfg.kFeatureSliceSize );
493+ size_t num_samples_read = 0 ;
468494 if (generateMicroFeatures (audio_samples, audio_samples_size,
469- cfg.kFeatureSliceSize , new_slice_data,
495+ new_slice_data, cfg.kFeatureSliceSize ,
470496 &num_samples_read) != kTfLiteOk ) {
471497 LOGE (" Error generateMicroFeatures" );
472498 }
473499
474- // printFeatures();
500+ // printFeatures();
475501 }
476502
477503 // / For debugging: print feature matrix
@@ -483,11 +509,11 @@ class TfLiteAudioFeatureProvider {
483509 }
484510 Serial.println ();
485511 }
512+ Serial.println (" ------------" );
486513 }
487514
488515 virtual TfLiteStatus initializeMicroFeatures () {
489516 LOGD (LOG_METHOD);
490- FrontendConfig config;
491517 config.window .size_ms = cfg.kFeatureSliceDurationMs ;
492518 config.window .step_size_ms = cfg.kFeatureSliceStrideMs ;
493519 config.noise_reduction .smoothing_bits = 10 ;
@@ -506,38 +532,42 @@ class TfLiteAudioFeatureProvider {
506532 config.log_scale .scale_shift = 6 ;
507533 if (!FrontendPopulateState (&config, &g_micro_features_state,
508534 cfg.kAudioSampleFrequency )) {
509- LOGE (" FrontendPopulateState () failed" );
535+ LOGE (" frontendPopulateState () failed" );
510536 return kTfLiteError ;
511537 }
512- g_is_first_time = true ;
513538 return kTfLiteOk ;
514539 }
515540
516- // This is not exposed in any header, and is only used for testing, to ensure
517- // that the state is correctly set up before generating results.
518- void setMicroFeaturesNoiseEstimates (const uint32_t * estimate_presets) {
519- LOGD (LOG_METHOD);
520- for (int i = 0 ; i < g_micro_features_state.filterbank .num_channels ; ++i) {
521- g_micro_features_state.noise_reduction .estimate [i] = estimate_presets[i];
522- }
523- }
541+ // // This is not exposed in any header, and is only used for testing, to ensure
542+ // // that the state is correctly set up before generating results.
543+ // void setMicroFeaturesNoiseEstimates(const uint32_t* estimate_presets) {
544+ // LOGD(LOG_METHOD);
545+ // for (int i = 0; i < g_micro_features_state.filterbank.num_channels; ++i) {
546+ // g_micro_features_state.noise_reduction.estimate[i] = estimate_presets[i];
547+ // }
548+ // }
524549
525550 virtual TfLiteStatus generateMicroFeatures (const int16_t * input, int input_size,
526- int output_size, int8_t * output,
551+ int8_t * output, int output_size,
527552 size_t * num_samples_read) {
528553 LOGD (LOG_METHOD);
529- const int16_t * frontend_input;
530- if (g_is_first_time) {
531- frontend_input = input;
532- g_is_first_time = false ;
533- } else {
534- frontend_input = input;
535- }
554+ const int16_t * frontend_input=input;
536555
537556 // Apply FFT
538557 FrontendOutput frontend_output = FrontendProcessSamples (
539558 &g_micro_features_state, frontend_input, input_size, num_samples_read);
540559
560+ // Check size
561+ if (output_size != frontend_output.size ){
562+ LOGE (" output_size=%d, frontend_output.size=%d" ,output_size, frontend_output.size );
563+ }
564+
565+ // // check generated features
566+ // if (input_size != *num_samples_read){
567+ // LOGE("audio_samples_size=%d vs num_samples_read=%d", input_size, *num_samples_read);
568+ // }
569+
570+
541571 for (size_t i = 0 ; i < frontend_output.size ; ++i) {
542572 // These scaling values are derived from those used in input_data.py in
543573 // the training pipeline. The feature pipeline outputs 16-bit signed
@@ -675,7 +705,7 @@ class TfLiteAudioOutput : public AudioPrint {
675705 // we submit int16 data which will be reduced to 8bits so we can send
676706 // double the amount - 2 channels will be reduced to 1 so we multiply by
677707 // number of channels
678- int maxBytes = cfg.kMaxAudioSampleSize * 2 * cfg.kAudioChannels ;
708+ int maxBytes = cfg.audioSampleSize () * 2 * cfg.kAudioChannels ;
679709 while (open > 0 ) {
680710 int len = min (open, maxBytes);
681711 result += processAudio (audio + pos, len);
@@ -810,10 +840,10 @@ class TfLiteAudioOutput : public AudioPrint {
810840 uint8_t score = 0 ;
811841 bool is_new_command = false ;
812842
813- TfLiteStatus process_status = recognizer->ProcessLatestResults (
843+ TfLiteStatus process_status = recognizer->processLatestResults (
814844 output, current_time, &found_command, &score, &is_new_command);
815845 if (process_status != kTfLiteOk ) {
816- LOGE (" TfLiteRecognizeCommands::ProcessLatestResults () failed" );
846+ LOGE (" TfLiteRecognizeCommands::processLatestResults () failed" );
817847 return 0 ;
818848 }
819849 // Do something based on the recognized command. The default
0 commit comments