17namespace micro_wake_word {
19static const char *
const TAG =
"micro_wake_word";
21static const ssize_t DETECTION_QUEUE_LENGTH = 5;
23static const size_t DATA_TIMEOUT_MS = 50;
25static const uint32_t RING_BUFFER_DURATION_MS = 120;
27static const uint32_t INFERENCE_TASK_STACK_SIZE = 3072;
28static const UBaseType_t INFERENCE_TASK_PRIORITY = 3;
49static const LogString *micro_wake_word_state_to_string(
State state) {
52 return LOG_STR(
"STARTING");
54 return LOG_STR(
"DETECTING_WAKE_WORD");
56 return LOG_STR(
"STOPPING");
58 return LOG_STR(
"STOPPED");
60 return LOG_STR(
"UNKNOWN");
65 ESP_LOGCONFIG(TAG,
"microWakeWord:");
66 ESP_LOGCONFIG(TAG,
" models:");
68 model->log_model_config();
70#ifdef USE_MICRO_WAKE_WORD_VAD
79 this->
frontend_config_.filterbank.lower_band_limit = FILTERBANK_LOWER_BAND_LIMIT;
80 this->
frontend_config_.filterbank.upper_band_limit = FILTERBANK_UPPER_BAND_LIMIT;
81 this->
frontend_config_.noise_reduction.smoothing_bits = NOISE_REDUCTION_SMOOTHING_BITS;
82 this->
frontend_config_.noise_reduction.even_smoothing = NOISE_REDUCTION_EVEN_SMOOTHING;
83 this->
frontend_config_.noise_reduction.odd_smoothing = NOISE_REDUCTION_ODD_SMOOTHING;
84 this->
frontend_config_.noise_reduction.min_signal_remaining = NOISE_REDUCTION_MIN_SIGNAL_REMAINING;
85 this->
frontend_config_.pcan_gain_control.enable_pcan = PCAN_GAIN_CONTROL_ENABLE_PCAN;
86 this->
frontend_config_.pcan_gain_control.strength = PCAN_GAIN_CONTROL_STRENGTH;
88 this->
frontend_config_.pcan_gain_control.gain_bits = PCAN_GAIN_CONTROL_GAIN_BITS;
94 ESP_LOGE(TAG,
"Failed to create event group");
101 ESP_LOGE(TAG,
"Failed to create detection event queue");
110 std::shared_ptr<RingBuffer> temp_ring_buffer = this->
ring_buffer_.lock();
112 size_t bytes_free = temp_ring_buffer->free();
114 if (bytes_free < data.size()) {
116 temp_ring_buffer->reset();
118 temp_ring_buffer->write((
void *) data.data(), data.size());
141 const size_t new_bytes_to_process =
143 std::unique_ptr<audio::AudioSourceTransferBuffer> audio_buffer;
144 int8_t features_buffer[PREPROCESSOR_FEATURE_SIZE];
150 if (audio_buffer ==
nullptr) {
159 if (temp_ring_buffer.use_count() == 0) {
162 audio_buffer->set_source(temp_ring_buffer);
171 audio_buffer->transfer_data_from_source(pdMS_TO_TICKS(DATA_TIMEOUT_MS));
173 if (audio_buffer->available() < new_bytes_to_process) {
180 (int16_t *) audio_buffer->get_buffer_start(), audio_buffer->available() /
sizeof(int16_t), features_buffer);
181 audio_buffer->decrease_buffer_length(processed_samples *
sizeof(int16_t));
209 std::vector<WakeWordModel *> external_wake_word_models;
211 if (!model->get_internal_only()) {
212 external_wake_word_models.push_back(model);
215 return external_wake_word_models;
220#ifdef USE_MICRO_WAKE_WORD_VAD
222 size_t tensor_arena_size) {
223 this->
vad_model_ = make_unique<VADModel>(model_start, probability_cutoff, sliding_window_size, tensor_arena_size);
240 uint32_t event_group_bits = xEventGroupGetBits(this->
event_group_);
243 xEventGroupClearBits(this->
event_group_, EventGroupBits::ERROR_MEMORY);
244 ESP_LOGE(TAG,
"Encountered an error allocating buffers");
248 xEventGroupClearBits(this->
event_group_, EventGroupBits::ERROR_INFERENCE);
249 ESP_LOGE(TAG,
"Encountered an error while performing an inference");
253 xEventGroupClearBits(this->
event_group_, EventGroupBits::WARNING_FULL_RING_BUFFER);
254 ESP_LOGW(TAG,
"Not enough free bytes in ring buffer to store incoming audio data. Resetting the ring buffer. Wake "
255 "word detection accuracy will temporarily be reduced.");
259 ESP_LOGD(TAG,
"Inference task has started, attempting to allocate memory for buffers");
260 xEventGroupClearBits(this->
event_group_, EventGroupBits::TASK_STARTING);
264 ESP_LOGD(TAG,
"Inference task is running");
266 xEventGroupClearBits(this->
event_group_, EventGroupBits::TASK_RUNNING);
271 ESP_LOGD(TAG,
"Inference task is stopping, deallocating buffers");
272 xEventGroupClearBits(this->
event_group_, EventGroupBits::TASK_STOPPING);
276 ESP_LOGD(TAG,
"Inference task is finished, freeing task resources");
302 "Failed to allocate buffers for spectrogram feature processor, attempting again in 1 second", 1000);
319 ESP_LOGD(TAG,
"Wake word model predicts '%s', but VAD model doesn't.", detection_event.
wake_word->c_str());
321 constexpr float uint8_to_float_divisor =
323 ESP_LOGD(TAG,
"Detected '%s' with sliding average probability is %.2f and max probability is %.2f",
344 ESP_LOGW(TAG,
"Wake word detection can't start as the component hasn't been setup yet");
349 ESP_LOGW(TAG,
"Wake word component is marked as failed. Please check setup logs");
354 ESP_LOGW(TAG,
"Wake word detection is already running");
358 ESP_LOGD(TAG,
"Starting wake word detection");
368 ESP_LOGD(TAG,
"Stopping wake word detection");
375 if (this->
state_ != state) {
376 ESP_LOGD(TAG,
"State changed from %s to %s", LOG_STR_ARG(micro_wake_word_state_to_string(this->
state_)),
377 LOG_STR_ARG(micro_wake_word_state_to_string(
state)));
383 int8_t features_buffer[PREPROCESSOR_FEATURE_SIZE]) {
384 size_t processed_samples = 0;
385 struct FrontendOutput frontend_output =
386 FrontendProcessSamples(&this->
frontend_state_, audio_buffer, samples_available, &processed_samples);
388 for (
size_t i = 0; i < frontend_output.size; ++i) {
403 constexpr int32_t value_scale = 256;
404 constexpr int32_t value_div = 666;
405 int32_t value = ((frontend_output.values[i] * value_scale) + (value_div / 2)) / value_div;
408 features_buffer[i] =
static_cast<int8_t
>(clamp<int32_t>(value, INT8_MIN, INT8_MAX));
411 return processed_samples;
415#ifdef USE_MICRO_WAKE_WORD_VAD
422 if (model->get_unprocessed_probability_status()) {
426#ifdef USE_MICRO_WAKE_WORD_VAD
432#if defined(USE_SOCKET_SELECT_SUPPORT) && defined(USE_WAKE_LOOP_THREADSAFE)
436 model->reset_probabilities();
437#ifdef USE_MICRO_WAKE_WORD_VAD
450 model->unload_model();
452#ifdef USE_MICRO_WAKE_WORD_VAD
462 success = success & model->perform_streaming_inference(audio_features);
464#ifdef USE_MICRO_WAKE_WORD_VAD
465 success = success & this->
vad_model_->perform_streaming_inference(audio_features);
void wake_loop_threadsafe()
Wake the main event loop from a FreeRTOS task. Thread-safe; can be called from task context to immedia...
virtual void mark_failed()
Mark this component as failed.
void status_momentary_error(const std::string &name, uint32_t length=5000)
bool status_has_error() const
static std::unique_ptr< RingBuffer > create(size_t len)
void trigger(const Ts &...x)
Inform the parent automation that the event has triggered.
static std::unique_ptr< AudioSourceTransferBuffer > create(size_t buffer_size)
Creates a new source transfer buffer.
size_t ms_to_bytes(uint32_t ms) const
Converts duration to bytes.
uint32_t get_sample_rate() const
void resume_task_()
Resumes the inference task.
microphone::MicrophoneSource * microphone_source_
QueueHandle_t detection_queue_
static void inference_task(void *params)
void process_probabilities_()
Processes any new probabilities for each model.
EventGroupHandle_t event_group_
std::weak_ptr< RingBuffer > ring_buffer_
TaskHandle_t inference_task_handle_
void set_state_(State state)
Trigger< std::string > * wake_word_detected_trigger_
std::vector< WakeWordModel * > wake_word_models_
void suspend_task_()
Suspends the inference task.
float get_setup_priority() const override
void add_wake_word_model(WakeWordModel *model)
void dump_config() override
bool stop_after_detection_
uint8_t features_step_size_
bool update_model_probabilities_(const int8_t audio_features[PREPROCESSOR_FEATURE_SIZE])
Runs an inference with each model using the new spectrogram features.
size_t generate_features_(int16_t *audio_buffer, size_t samples_available, int8_t features_buffer[PREPROCESSOR_FEATURE_SIZE])
Generates spectrogram features from an input buffer of audio samples.
struct FrontendConfig frontend_config_
std::unique_ptr< VADModel > vad_model_
void add_vad_model(const uint8_t *model_start, uint8_t probability_cutoff, size_t sliding_window_size, size_t tensor_arena_size)
void unload_models_()
Deletes each model's TFLite interpreters and frees tensor arena memory.
std::vector< WakeWordModel * > get_wake_words()
struct FrontendState frontend_state_
void add_data_callback(std::function< void(const std::vector< uint8_t > &)> &&data_callback)
audio::AudioStreamInfo get_audio_stream_info()
Gets the AudioStreamInfo of the data after processing.
void add_on_state_callback(std::function< void(OTAState, float, uint8_t, OTAComponent *)> &&callback)
@ WARNING_FULL_RING_BUFFER
OTAGlobalCallback * get_global_ota_callback()
const float AFTER_CONNECTION
For components that should be initialized after a data connection (API/MQTT) is connected.
Providing packet encoding functions for exchanging data with a remote host.
void IRAM_ATTR HOT delay(uint32_t ms)
Application App
Global storage of Application pointer - only one Application can exist.
uint8_t average_probability