ESPHome: esphome/components/micro_wake_word/streaming_model.cpp Source File

#include "streaming_model.h"


#ifdef USE_ESP32


#include "esphome/core/helpers.h"

#include "esphome/core/log.h"


static const char *const TAG = "micro_wake_word";


namespace esphome::micro_wake_word {


// Logs this wake word model's configuration: the wake word phrase, the probability cutoff scaled from its
// 8-bit representation to a 0-1 fraction, and the sliding window size.
void WakeWordModel::log_model_config() {
  // sliding_window_size_ is a size_t, so it must be printed with %zu rather than %d
  ESP_LOGCONFIG(TAG,
                "    - Wake Word: %s\n"
                "      Probability cutoff: %.2f\n"
                "      Sliding window size: %zu",
                this->wake_word_.c_str(), this->probability_cutoff_ / 255.0f, this->sliding_window_size_);
}


// Logs the VAD model's configuration: the probability cutoff scaled from its 8-bit representation to a 0-1
// fraction, and the sliding window size.
void VADModel::log_model_config() {
  // sliding_window_size_ is a size_t, so it must be printed with %zu rather than %d
  ESP_LOGCONFIG(TAG,
                "    - VAD Model\n"
                "      Probability cutoff: %.2f\n"
                "      Sliding window size: %zu",
                this->probability_cutoff_ / 255.0f, this->sliding_window_size_);
}


bool StreamingModel::load_model_() {

  RAMAllocator<uint8_t> arena_allocator;


  if (this->var_arena_ == nullptr) {

    this->var_arena_ = arena_allocator.allocate(STREAMING_MODEL_VARIABLE_ARENA_SIZE);

    if (this->var_arena_ == nullptr) {

      ESP_LOGE(TAG, "Could not allocate the streaming model's variable tensor arena.");

      return false;

    }

    this->ma_ = tflite::MicroAllocator::Create(this->var_arena_, STREAMING_MODEL_VARIABLE_ARENA_SIZE);

    this->mrv_ = tflite::MicroResourceVariables::Create(this->ma_, 20);

  }


  const tflite::Model *model = tflite::GetModel(this->model_start_);

  if (model->version() != TFLITE_SCHEMA_VERSION) {

    ESP_LOGE(TAG, "Streaming model's schema is not supported");

    return false;

  }


  // Probe for the actual required tensor arena size if not yet determined

  if (!this->tensor_arena_size_probed_) {

    size_t probed_size = this->probe_arena_size_();

    if (probed_size > 0) {

      ESP_LOGD(TAG, "Probed tensor arena size: %zu bytes", probed_size);

      this->tensor_arena_size_ = probed_size;

    } else {

      ESP_LOGW(TAG, "Arena size probe failed, using manifest size: %zu bytes", this->tensor_arena_size_);

    }

    this->tensor_arena_size_probed_ = true;

  }


  if (this->tensor_arena_ == nullptr) {

    this->tensor_arena_ = arena_allocator.allocate(this->tensor_arena_size_);

    if (this->tensor_arena_ == nullptr) {

      ESP_LOGE(TAG, "Could not allocate the streaming model's tensor arena.");

      return false;

    }

  }


  if (this->interpreter_ == nullptr) {

    this->interpreter_ =

        make_unique<tflite::MicroInterpreter>(tflite::GetModel(this->model_start_), this->streaming_op_resolver_,

                                              this->tensor_arena_, this->tensor_arena_size_, this->mrv_);

    if (this->interpreter_->AllocateTensors() != kTfLiteOk) {

      ESP_LOGE(TAG, "Failed to allocate tensors for the streaming model");

      return false;

    }


    // Verify input tensor matches expected values

    // Dimension 3 will represent the first layer stride, so skip it may vary

    TfLiteTensor *input = this->interpreter_->input(0);

    if ((input->dims->size != 3) || (input->dims->data[0] != 1) ||

        (input->dims->data[2] != PREPROCESSOR_FEATURE_SIZE)) {

      ESP_LOGE(TAG, "Streaming model tensor input dimensions has improper dimensions.");

      return false;

    }


    if (input->type != kTfLiteInt8) {

      ESP_LOGE(TAG, "Streaming model tensor input is not int8.");

      return false;

    }


    // Verify output tensor matches expected values

    TfLiteTensor *output = this->interpreter_->output(0);

    if ((output->dims->size != 2) || (output->dims->data[0] != 1) || (output->dims->data[1] != 1)) {

      ESP_LOGE(TAG, "Streaming model tensor output dimension is not 1x1.");

      return false;

    }


    if (output->type != kTfLiteUInt8) {

      ESP_LOGE(TAG, "Streaming model tensor output is not uint8.");

      return false;

    }

  }


  this->loaded_ = true;

  this->reset_probabilities();

  return true;

}


size_t StreamingModel::probe_arena_size_() {

  RAMAllocator<uint8_t> arena_allocator;


  // Try with the manifest size first, then escalates to 1.5, then 2x if it fails. Different platforms and different

  // versions of the esp-nn library require different amounts of memory, so the manifest size may not always be correct,

  // and probing allows us to find the actual required size for the current build and platform. Aligns test sizes to 16

  // bytes.

  size_t attempt_sizes[] = {(this->tensor_arena_size_ + 15) & ~15, (this->tensor_arena_size_ * 3 / 2 + 15) & ~15,

                            (this->tensor_arena_size_ * 2 + 15) & ~15};


  for (size_t attempt_size : attempt_sizes) {

    uint8_t *probe_arena = arena_allocator.allocate(attempt_size);

    if (probe_arena == nullptr) {

      continue;

    }


    // Verify the model works at all with this arena size

    auto probe_interpreter = make_unique<tflite::MicroInterpreter>(

        tflite::GetModel(this->model_start_), this->streaming_op_resolver_, probe_arena, attempt_size, this->mrv_);


    if (probe_interpreter->AllocateTensors() != kTfLiteOk) {

      probe_interpreter.reset();

      arena_allocator.deallocate(probe_arena, attempt_size);

      this->ma_ = tflite::MicroAllocator::Create(this->var_arena_, STREAMING_MODEL_VARIABLE_ARENA_SIZE);

      this->mrv_ = tflite::MicroResourceVariables::Create(this->ma_, 20);

      continue;

    }


    // Try to shrink the arena. Start with arena_used_bytes() + 16 (rounded to 16-byte alignment).

    // If that works, use it. Otherwise, try midpoints between that and the full size until one succeeds.

    size_t lower = (probe_interpreter->arena_used_bytes() + 16 + 15) & ~15;

    probe_interpreter.reset();

    this->ma_ = tflite::MicroAllocator::Create(this->var_arena_, STREAMING_MODEL_VARIABLE_ARENA_SIZE);

    this->mrv_ = tflite::MicroResourceVariables::Create(this->ma_, 20);


    size_t upper = attempt_size;


    while (lower < upper) {

      auto test_interpreter = make_unique<tflite::MicroInterpreter>(

          tflite::GetModel(this->model_start_), this->streaming_op_resolver_, probe_arena, lower, this->mrv_);


      bool ok = test_interpreter->AllocateTensors() == kTfLiteOk;


      test_interpreter.reset();

      this->ma_ = tflite::MicroAllocator::Create(this->var_arena_, STREAMING_MODEL_VARIABLE_ARENA_SIZE);

      this->mrv_ = tflite::MicroResourceVariables::Create(this->ma_, 20);


      if (ok) {

        // Found a working size smaller than the full arena

        upper = lower + 16;  // Pad by 16 bytes to be safe for future allocations

        break;

      }


      // Try the midpoint between current attempt and full size

      lower = ((lower + upper) / 2 + 15) & ~15;

    }


    arena_allocator.deallocate(probe_arena, attempt_size);

    return upper;

  }


  return 0;

}


// Destroys the interpreter, frees the tensor and variable arenas, and marks the model as not loaded.
void StreamingModel::unload_model() {
  // The interpreter uses the arenas, so destroy it before freeing them
  this->interpreter_.reset();

  RAMAllocator<uint8_t> arena_allocator;

  if (this->tensor_arena_ != nullptr) {
    arena_allocator.deallocate(this->tensor_arena_, this->tensor_arena_size_);
    this->tensor_arena_ = nullptr;
  }

  if (this->var_arena_ != nullptr) {
    arena_allocator.deallocate(this->var_arena_, STREAMING_MODEL_VARIABLE_ARENA_SIZE);
    this->var_arena_ = nullptr;
    // ma_ and mrv_ were created on top of the variable arena, so they must not outlive it. load_model_()
    // recreates both when it reallocates the arena.
    this->ma_ = nullptr;
    this->mrv_ = nullptr;
  }

  this->loaded_ = false;
}


bool StreamingModel::perform_streaming_inference(const int8_t features[PREPROCESSOR_FEATURE_SIZE]) {

  if (this->enabled_ && !this->loaded_) {

    // Model is enabled but isn't loaded

    if (!this->load_model_()) {

      return false;

    }

  }


  if (!this->enabled_ && this->loaded_) {

    // Model is disabled but still loaded

    this->unload_model();

    return true;

  }


  if (this->loaded_) {

    TfLiteTensor *input = this->interpreter_->input(0);


    uint8_t stride = this->interpreter_->input(0)->dims->data[1];

    this->current_stride_step_ = this->current_stride_step_ % stride;


    std::memmove(

        (int8_t *) (tflite::GetTensorData<int8_t>(input)) + PREPROCESSOR_FEATURE_SIZE * this->current_stride_step_,

        features, PREPROCESSOR_FEATURE_SIZE);

    ++this->current_stride_step_;


    if (this->current_stride_step_ >= stride) {

      TfLiteStatus invoke_status = this->interpreter_->Invoke();

      if (invoke_status != kTfLiteOk) {

        ESP_LOGW(TAG, "Streaming interpreter invoke failed");

        return false;

      }


      TfLiteTensor *output = this->interpreter_->output(0);


      ++this->last_n_index_;

      if (this->last_n_index_ == this->sliding_window_size_)

        this->last_n_index_ = 0;

      this->recent_streaming_probabilities_[this->last_n_index_] = output->data.uint8[0];  // probability;

      this->unprocessed_probability_status_ = true;

    }

    if (this->recent_streaming_probabilities_[this->last_n_index_] < this->probability_cutoff_) {

      // Only increment ignore windows if less than the probability cutoff; this forces the model to "cool-off" from a

      // previous detection and calling ``reset_probabilities`` so it avoids duplicate detections

      this->ignore_windows_ = std::min(this->ignore_windows_ + 1, 0);

    }

  }

  return true;

}


// Zeroes every entry of the recent probabilities window and restarts the ignore-window countdown at
// -MIN_SLICES_BEFORE_DETECTION.
void StreamingModel::reset_probabilities() {
  std::fill(this->recent_streaming_probabilities_.begin(), this->recent_streaming_probabilities_.end(), 0);
  this->ignore_windows_ = -MIN_SLICES_BEFORE_DETECTION;
}


// Constructs a wake word model.
//
// Stores the model parameters, sizes the sliding window of recent probabilities (initialized to zero), registers
// the streaming ops with the op resolver, and determines the initial enabled state: the value saved in preferences
// under a hash of `id` if one can be loaded, otherwise `default_enabled`.
WakeWordModel::WakeWordModel(const std::string &id, const uint8_t *model_start, uint8_t default_probability_cutoff,
                             size_t sliding_window_average_size, const std::string &wake_word, size_t tensor_arena_size,
                             bool default_enabled, bool internal_only) {
  this->id_ = id;
  this->model_start_ = model_start;
  this->default_probability_cutoff_ = default_probability_cutoff;
  this->probability_cutoff_ = default_probability_cutoff;
  this->sliding_window_size_ = sliding_window_average_size;
  this->recent_streaming_probabilities_.resize(sliding_window_average_size, 0);
  this->wake_word_ = wake_word;
  this->tensor_arena_size_ = tensor_arena_size;
  // A constructor cannot report failure to its caller, so at least make a registration failure visible in the log
  // instead of silently ignoring it
  if (!this->register_streaming_ops_(this->streaming_op_resolver_)) {
    ESP_LOGE(TAG, "Failed to register the streaming model's TensorFlow operations");
  }
  this->current_stride_step_ = 0;
  this->internal_only_ = internal_only;

  this->pref_ = global_preferences->make_preference<bool>(fnv1_hash(id));
  bool enabled;
  if (this->pref_.load(&enabled)) {
    // Use the enabled state loaded from flash
    this->enabled_ = enabled;
  } else {
    // If no state saved, then use the default
    this->enabled_ = default_enabled;
  }
}


// Marks the model as enabled and, unless the model is internal-only, saves that state to preferences.
void WakeWordModel::enable() {
  this->enabled_ = true;
  if (this->internal_only_) {
    // Internal-only models never persist their enabled state
    return;
  }
  this->pref_.save(&this->enabled_);
}


// Marks the model as disabled and, unless the model is internal-only, saves that state to preferences.
void WakeWordModel::disable() {
  this->enabled_ = false;
  if (this->internal_only_) {
    // Internal-only models never persist their enabled state
    return;
  }
  this->pref_.save(&this->enabled_);
}


// Decides whether the wake word was detected from the sliding window of recent probabilities.
//
// Reports no detection while the ignore-window counter is still negative or the model is disabled. Otherwise the
// event carries the window's maximum and (integer) mean probability, and `detected` is set when the sum of the
// window exceeds the probability cutoff multiplied by the window size. Clears the unprocessed-probability flag when
// the window is evaluated.
DetectionEvent WakeWordModel::determine_detected() {
  DetectionEvent event;
  event.wake_word = &this->wake_word_;
  event.max_probability = 0;
  event.average_probability = 0;

  const bool still_ignoring = this->ignore_windows_ < 0;
  if (still_ignoring || !this->enabled_) {
    event.detected = false;
    return event;
  }

  uint32_t total = 0;
  for (const uint8_t probability : this->recent_streaming_probabilities_) {
    if (probability > event.max_probability) {
      event.max_probability = probability;
    }
    total += probability;
  }

  const size_t window_size = this->sliding_window_size_;
  event.average_probability = total / window_size;
  event.detected = total > this->probability_cutoff_ * window_size;

  this->unprocessed_probability_status_ = false;
  return event;
}


// Constructs a VAD model: stores the model parameters, sizes the sliding window of recent probabilities
// (initialized to zero), and registers the streaming ops with the op resolver.
VADModel::VADModel(const uint8_t *model_start, uint8_t default_probability_cutoff, size_t sliding_window_size,
                   size_t tensor_arena_size) {
  this->model_start_ = model_start;
  this->default_probability_cutoff_ = default_probability_cutoff;
  this->probability_cutoff_ = default_probability_cutoff;
  this->sliding_window_size_ = sliding_window_size;
  this->recent_streaming_probabilities_.resize(sliding_window_size, 0);
  this->tensor_arena_size_ = tensor_arena_size;
  // A constructor cannot report failure to its caller, so at least make a registration failure visible in the log
  // instead of silently ignoring it
  if (!this->register_streaming_ops_(this->streaming_op_resolver_)) {
    ESP_LOGE(TAG, "Failed to register the streaming model's TensorFlow operations");
  }
}


// Decides whether voice activity is present from the sliding window of recent probabilities.
//
// A disabled VAD model always reports a detection so that it does not block wake words. Otherwise the event
// carries the window's maximum and (integer) mean probability, and `detected` is set when the sum of the window
// exceeds the probability cutoff multiplied by the window size.
DetectionEvent VADModel::determine_detected() {
  DetectionEvent event;
  event.max_probability = 0;
  event.average_probability = 0;

  if (!this->enabled_) {
    // We disabled the VAD model for some reason... so we shouldn't block wake words from being detected
    event.detected = true;
    return event;
  }

  uint32_t total = 0;
  for (const uint8_t probability : this->recent_streaming_probabilities_) {
    if (probability > event.max_probability) {
      event.max_probability = probability;
    }
    total += probability;
  }

  const size_t window_size = this->sliding_window_size_;
  event.average_probability = total / window_size;
  event.detected = total > (this->probability_cutoff_ * window_size);

  return event;
}


// Registers every TensorFlow Lite Micro operation the streaming models use with the given resolver.
//
// Operations are added in a fixed order and registration stops at the first failure (the && chain
// short-circuits). Returns true only if all operations were registered successfully.
bool StreamingModel::register_streaming_ops_(tflite::MicroMutableOpResolver<20> &op_resolver) {
  return op_resolver.AddCallOnce() == kTfLiteOk &&         //
         op_resolver.AddVarHandle() == kTfLiteOk &&        //
         op_resolver.AddReshape() == kTfLiteOk &&          //
         op_resolver.AddReadVariable() == kTfLiteOk &&     //
         op_resolver.AddStridedSlice() == kTfLiteOk &&     //
         op_resolver.AddConcatenation() == kTfLiteOk &&    //
         op_resolver.AddAssignVariable() == kTfLiteOk &&   //
         op_resolver.AddConv2D() == kTfLiteOk &&           //
         op_resolver.AddMul() == kTfLiteOk &&              //
         op_resolver.AddAdd() == kTfLiteOk &&              //
         op_resolver.AddMean() == kTfLiteOk &&             //
         op_resolver.AddFullyConnected() == kTfLiteOk &&   //
         op_resolver.AddLogistic() == kTfLiteOk &&         //
         op_resolver.AddQuantize() == kTfLiteOk &&         //
         op_resolver.AddDepthwiseConv2D() == kTfLiteOk &&  //
         op_resolver.AddAveragePool2D() == kTfLiteOk &&    //
         op_resolver.AddMaxPool2D() == kTfLiteOk &&        //
         op_resolver.AddPad() == kTfLiteOk &&              //
         op_resolver.AddPack() == kTfLiteOk &&             //
         op_resolver.AddSplitV() == kTfLiteOk;
}


}  // namespace esphome::micro_wake_word


#endif

esphome::ESPPreferenceObject::save
bool save(const T *src)
Definition preference_backend.h:43

esphome::ESPPreferenceObject::load
bool load(T *dest)
Definition preference_backend.h:49

esphome::RAMAllocator
An STL allocator that uses SPI or internal RAM.
Definition helpers.h:2053

esphome::RAMAllocator::deallocate
void deallocate(T *p, size_t n)
Definition helpers.h:2110

esphome::RAMAllocator::allocate
T * allocate(size_t n)
Definition helpers.h:2080

esphome::micro_wake_word::StreamingModel::ma_
tflite::MicroAllocator * ma_
Definition streaming_model.h:94

esphome::micro_wake_word::StreamingModel::load_model_
bool load_model_()
Allocates tensor and variable arenas and sets up the model interpreter.
Definition streaming_model.cpp:28

esphome::micro_wake_word::StreamingModel::tensor_arena_
uint8_t * tensor_arena_
Definition streaming_model.h:90

esphome::micro_wake_word::StreamingModel::current_stride_step_
uint8_t current_stride_step_
Definition streaming_model.h:78

esphome::micro_wake_word::StreamingModel::ignore_windows_
int16_t ignore_windows_
Definition streaming_model.h:79

esphome::micro_wake_word::StreamingModel::sliding_window_size_
size_t sliding_window_size_
Definition streaming_model.h:83

esphome::micro_wake_word::StreamingModel::tensor_arena_size_probed_
bool tensor_arena_size_probed_
Definition streaming_model.h:76

esphome::micro_wake_word::StreamingModel::interpreter_
std::unique_ptr< tflite::MicroInterpreter > interpreter_
Definition streaming_model.h:92

esphome::micro_wake_word::StreamingModel::default_probability_cutoff_
uint8_t default_probability_cutoff_
Definition streaming_model.h:81

esphome::micro_wake_word::StreamingModel::streaming_op_resolver_
tflite::MicroMutableOpResolver< 20 > streaming_op_resolver_
Definition streaming_model.h:72

esphome::micro_wake_word::StreamingModel::register_streaming_ops_
bool register_streaming_ops_(tflite::MicroMutableOpResolver< 20 > &op_resolver)
Returns true if successfully registered the streaming model's TensorFlow operations.
Definition streaming_model.cpp:344

esphome::micro_wake_word::StreamingModel::loaded_
bool loaded_
Definition streaming_model.h:74

esphome::micro_wake_word::StreamingModel::reset_probabilities
void reset_probabilities()
Sets all recent_streaming_probabilities to 0 and resets the ignore window count.
Definition streaming_model.cpp:239

esphome::micro_wake_word::StreamingModel::tensor_arena_size_
size_t tensor_arena_size_
Definition streaming_model.h:86

esphome::micro_wake_word::StreamingModel::recent_streaming_probabilities_
std::vector< uint8_t > recent_streaming_probabilities_
Definition streaming_model.h:87

esphome::micro_wake_word::StreamingModel::probe_arena_size_
size_t probe_arena_size_()
Probes the actual required tensor arena size by trial allocation.
Definition streaming_model.cpp:108

esphome::micro_wake_word::StreamingModel::last_n_index_
size_t last_n_index_
Definition streaming_model.h:85

esphome::micro_wake_word::StreamingModel::var_arena_
uint8_t * var_arena_
Definition streaming_model.h:91

esphome::micro_wake_word::StreamingModel::unprocessed_probability_status_
bool unprocessed_probability_status_
Definition streaming_model.h:77

esphome::micro_wake_word::StreamingModel::mrv_
tflite::MicroResourceVariables * mrv_
Definition streaming_model.h:93

esphome::micro_wake_word::StreamingModel::perform_streaming_inference
bool perform_streaming_inference(const int8_t features[PREPROCESSOR_FEATURE_SIZE])
Definition streaming_model.cpp:190

esphome::micro_wake_word::StreamingModel::probability_cutoff_
uint8_t probability_cutoff_
Definition streaming_model.h:82

esphome::micro_wake_word::StreamingModel::unload_model
void unload_model()
Destroys the TFLite interpreter and frees the tensor and variable arenas' memory.
Definition streaming_model.cpp:172

esphome::micro_wake_word::StreamingModel::enabled_
bool enabled_
Definition streaming_model.h:75

esphome::micro_wake_word::StreamingModel::model_start_
const uint8_t * model_start_
Definition streaming_model.h:89

esphome::micro_wake_word::VADModel::determine_detected
DetectionEvent determine_detected() override
Checks for voice activity by comparing the max probability in the sliding window with the probability...
Definition streaming_model.cpp:321

esphome::micro_wake_word::VADModel::VADModel
VADModel(const uint8_t *model_start, uint8_t default_probability_cutoff, size_t sliding_window_size, size_t tensor_arena_size)
Definition streaming_model.cpp:310

esphome::micro_wake_word::VADModel::log_model_config
void log_model_config() override
Definition streaming_model.cpp:20

esphome::micro_wake_word::WakeWordModel::enable
void enable() override
Enable the model and save to flash. The next performing_streaming_inference call will load it.
Definition streaming_model.cpp:272

esphome::micro_wake_word::WakeWordModel::id_
std::string id_
Definition streaming_model.h:135

esphome::micro_wake_word::WakeWordModel::determine_detected
DetectionEvent determine_detected() override
Checks for the wake word by comparing the mean probability in the sliding window with the probability...
Definition streaming_model.cpp:286

esphome::micro_wake_word::WakeWordModel::log_model_config
void log_model_config() override
Definition streaming_model.cpp:12

esphome::micro_wake_word::WakeWordModel::wake_word_
std::string wake_word_
Definition streaming_model.h:136

esphome::micro_wake_word::WakeWordModel::pref_
ESPPreferenceObject pref_
Definition streaming_model.h:141

esphome::micro_wake_word::WakeWordModel::WakeWordModel
WakeWordModel(const std::string &id, const uint8_t *model_start, uint8_t default_probability_cutoff, size_t sliding_window_average_size, const std::string &wake_word, size_t tensor_arena_size, bool default_enabled, bool internal_only)
Constructs a wake word model object.
Definition streaming_model.cpp:246

esphome::micro_wake_word::WakeWordModel::disable
void disable() override
Disable the model and save to flash. The next performing_streaming_inference call will unload it.
Definition streaming_model.cpp:279

esphome::micro_wake_word::WakeWordModel::internal_only_
bool internal_only_
Definition streaming_model.h:139

id
uint16_t id
Definition dns_server_esp32_idf.cpp:0

helpers.h

log.h

esphome::micro_wake_word
Definition automation.h:8

esphome::global_preferences
ESPPreferences * global_preferences
Definition preferences.cpp:209

esphome::fnv1_hash
uint32_t fnv1_hash(const char *str)
Calculate a FNV-1 hash of str.
Definition helpers.cpp:160

uint32_t
static void uint32_t
Definition crash_handler.cpp:141

streaming_model.h

esphome::Preferences::make_preference
ESPPreferenceObject make_preference(size_t, uint32_t, bool)
Definition preferences.h:24

esphome::micro_wake_word::DetectionEvent
Definition streaming_model.h:18

esphome::micro_wake_word::DetectionEvent::detected
bool detected
Definition streaming_model.h:20

esphome::micro_wake_word::DetectionEvent::wake_word
std::string * wake_word
Definition streaming_model.h:19

esphome::micro_wake_word::DetectionEvent::max_probability
uint8_t max_probability
Definition streaming_model.h:23

esphome::micro_wake_word::DetectionEvent::average_probability
uint8_t average_probability
Definition streaming_model.h:24