18static const char *
const TAG =
"micro_wake_word";
20static const ssize_t DETECTION_QUEUE_LENGTH = 5;
22static const size_t DATA_TIMEOUT_MS = 50;
24static const uint32_t RING_BUFFER_DURATION_MS = 120;
26#ifdef CONFIG_IDF_TARGET_ESP32P4
29static const uint32_t INFERENCE_TASK_STACK_SIZE = 8192;
31static const uint32_t INFERENCE_TASK_STACK_SIZE = 3072;
33static const UBaseType_t INFERENCE_TASK_PRIORITY = 3;
55static const LogString *micro_wake_word_state_to_string(
State state) {
58 return LOG_STR(
"STARTING");
60 return LOG_STR(
"DETECTING_WAKE_WORD");
62 return LOG_STR(
"STOPPING");
64 return LOG_STR(
"STOPPED");
66 return LOG_STR(
"UNKNOWN");
71 ESP_LOGCONFIG(TAG,
"microWakeWord:");
72 ESP_LOGCONFIG(TAG,
" models:");
74 model->log_model_config();
76#ifdef USE_MICRO_WAKE_WORD_VAD
85 this->
frontend_config_.filterbank.lower_band_limit = FILTERBANK_LOWER_BAND_LIMIT;
86 this->
frontend_config_.filterbank.upper_band_limit = FILTERBANK_UPPER_BAND_LIMIT;
87 this->
frontend_config_.noise_reduction.smoothing_bits = NOISE_REDUCTION_SMOOTHING_BITS;
88 this->
frontend_config_.noise_reduction.even_smoothing = NOISE_REDUCTION_EVEN_SMOOTHING;
89 this->
frontend_config_.noise_reduction.odd_smoothing = NOISE_REDUCTION_ODD_SMOOTHING;
90 this->
frontend_config_.noise_reduction.min_signal_remaining = NOISE_REDUCTION_MIN_SIGNAL_REMAINING;
91 this->
frontend_config_.pcan_gain_control.enable_pcan = PCAN_GAIN_CONTROL_ENABLE_PCAN;
92 this->
frontend_config_.pcan_gain_control.strength = PCAN_GAIN_CONTROL_STRENGTH;
94 this->
frontend_config_.pcan_gain_control.gain_bits = PCAN_GAIN_CONTROL_GAIN_BITS;
100 ESP_LOGE(TAG,
"Failed to create event group");
107 ESP_LOGE(TAG,
"Failed to create detection event queue");
116 std::shared_ptr<ring_buffer::RingBuffer> temp_ring_buffer = this->
ring_buffer_.lock();
121 if (temp_ring_buffer->write_without_replacement(data.data(), data.size(), 0,
false) == 0) {
128#ifdef USE_OTA_STATE_LISTENER
133#ifdef USE_OTA_STATE_LISTENER
153 std::unique_ptr<audio::RingBufferAudioSource> audio_source;
154 int8_t features_buffer[PREPROCESSOR_FEATURE_SIZE];
158 const size_t ring_buffer_size =
159 (stream_info.ms_to_bytes(RING_BUFFER_DURATION_MS) / bytes_per_frame) * bytes_per_frame;
161 if (temp_ring_buffer ==
nullptr) {
165 static_cast<uint8_t
>(bytes_per_frame));
166 if (audio_source ==
nullptr) {
181 audio_source->clear_buffered_data();
185 audio_source->fill(pdMS_TO_TICKS(DATA_TIMEOUT_MS),
false);
190 while (audio_source->available() >=
sizeof(int16_t)) {
191 const size_t samples_available = audio_source->available() /
sizeof(int16_t);
192 const int16_t *audio_data =
reinterpret_cast<const int16_t *
>(audio_source->data());
194 size_t processed_samples = 0;
195 const bool feature_generated =
196 this_mww->
generate_features_(audio_data, samples_available, features_buffer, &processed_samples);
197 audio_source->consume(processed_samples *
sizeof(int16_t));
199 if (feature_generated) {
220 vTaskSuspend(
nullptr);
224 std::vector<WakeWordModel *> external_wake_word_models;
226 if (!model->get_internal_only()) {
227 external_wake_word_models.push_back(model);
230 return external_wake_word_models;
235#ifdef USE_MICRO_WAKE_WORD_VAD
237 size_t tensor_arena_size) {
238 this->
vad_model_ = make_unique<VADModel>(model_start, probability_cutoff, sliding_window_size, tensor_arena_size);
258 xEventGroupClearBits(this->
event_group_, EventGroupBits::ERROR_MEMORY);
259 ESP_LOGE(TAG,
"Encountered an error allocating buffers");
263 xEventGroupClearBits(this->
event_group_, EventGroupBits::ERROR_INFERENCE);
264 ESP_LOGE(TAG,
"Encountered an error while performing an inference");
268 xEventGroupClearBits(this->
event_group_, EventGroupBits::WARNING_FULL_RING_BUFFER);
269 ESP_LOGW(TAG,
"Not enough free bytes in ring buffer to store incoming audio data. Resetting the ring buffer. Wake "
270 "word detection accuracy will temporarily be reduced.");
274 ESP_LOGD(TAG,
"Inference task has started, attempting to allocate memory for buffers");
275 xEventGroupClearBits(this->
event_group_, EventGroupBits::TASK_STARTING);
279 ESP_LOGD(TAG,
"Inference task is running");
281 xEventGroupClearBits(this->
event_group_, EventGroupBits::TASK_RUNNING);
286 ESP_LOGD(TAG,
"Inference task is stopping, deallocating buffers");
287 xEventGroupClearBits(this->
event_group_, EventGroupBits::TASK_STOPPING);
291 ESP_LOGD(TAG,
"Inference task is finished, freeing task resources");
320 (
void *)
this, INFERENCE_TASK_PRIORITY, this->task_stack_in_psram_)) {
330 ESP_LOGD(TAG,
"Wake word model predicts '%s', but VAD model doesn't.", detection_event.
wake_word->c_str());
332 constexpr float uint8_to_float_divisor =
334 ESP_LOGD(TAG,
"Detected '%s' with sliding average probability is %.2f and max probability is %.2f",
355 ESP_LOGW(TAG,
"Wake word detection can't start as the component hasn't been setup yet");
360 ESP_LOGW(TAG,
"Wake word component is marked as failed. Please check setup logs");
365 ESP_LOGW(TAG,
"Wake word detection is already running");
369 ESP_LOGD(TAG,
"Starting wake word detection");
379 ESP_LOGD(TAG,
"Stopping wake word detection");
386 if (this->
state_ != state) {
387 ESP_LOGD(TAG,
"State changed from %s to %s", LOG_STR_ARG(micro_wake_word_state_to_string(this->
state_)),
388 LOG_STR_ARG(micro_wake_word_state_to_string(
state)));
394 int8_t features_buffer[PREPROCESSOR_FEATURE_SIZE],
size_t *processed_samples) {
395 *processed_samples = 0;
396 struct FrontendOutput frontend_output =
397 FrontendProcessSamples(&this->
frontend_state_, audio_buffer, samples_available, processed_samples);
399 if (frontend_output.size == 0) {
403 for (
size_t i = 0; i < frontend_output.size; ++i) {
418 constexpr int32_t value_scale = 256;
419 constexpr int32_t value_div = 666;
420 int32_t value = ((frontend_output.values[i] * value_scale) + (value_div / 2)) / value_div;
423 features_buffer[i] =
static_cast<int8_t
>(clamp<int32_t>(value, INT8_MIN, INT8_MAX));
430#ifdef USE_MICRO_WAKE_WORD_VAD
437 if (model->get_unprocessed_probability_status()) {
441#ifdef USE_MICRO_WAKE_WORD_VAD
449 model->reset_probabilities();
450#ifdef USE_MICRO_WAKE_WORD_VAD
463 model->unload_model();
465#ifdef USE_MICRO_WAKE_WORD_VAD
475 success = success & model->perform_streaming_inference(audio_features);
477#ifdef USE_MICRO_WAKE_WORD_VAD
478 success = success & this->
vad_model_->perform_streaming_inference(audio_features);
void wake_loop_threadsafe()
Wake the main event loop from another thread or callback.
void mark_failed()
Mark this component as failed.
void status_momentary_error(const char *name, uint32_t length=5000)
Set error status flag and automatically clear it after a timeout.
bool create(TaskFunction_t fn, const char *name, uint32_t stack_size, void *param, UBaseType_t priority, bool use_psram)
Allocate stack and create task.
bool is_created() const
Check if the task has been created and not yet destroyed.
TaskHandle_t get_handle() const
Get the FreeRTOS task handle.
void deallocate()
Delete the task (if running) and free the stack buffer.
void trigger(const Ts &...x) ESPHOME_ALWAYS_INLINE
Inform the parent automation that the event has triggered.
size_t frames_to_bytes(uint32_t frames) const
Converts frames to bytes.
uint32_t get_sample_rate() const
static std::unique_ptr< RingBufferAudioSource > create(std::shared_ptr< ring_buffer::RingBuffer > ring_buffer, size_t max_fill_bytes, uint8_t alignment_bytes=1)
Creates a new ring-buffer-backed audio source after validating its parameters.
void resume_task_()
Resumes the inference task.
microphone::MicrophoneSource * microphone_source_
QueueHandle_t detection_queue_
static void inference_task(void *params)
void process_probabilities_()
Processes any new probabilities for each model.
EventGroupHandle_t event_group_
void set_state_(State state)
std::vector< WakeWordModel * > wake_word_models_
void suspend_task_()
Suspends the inference task.
float get_setup_priority() const override
Trigger< std::string > wake_word_detected_trigger_
void add_wake_word_model(WakeWordModel *model)
bool generate_features_(const int16_t *audio_buffer, size_t samples_available, int8_t features_buffer[PREPROCESSOR_FEATURE_SIZE], size_t *processed_samples)
Generates a spectrogram feature from an input buffer of audio samples.
void dump_config() override
bool stop_after_detection_
uint8_t features_step_size_
bool update_model_probabilities_(const int8_t audio_features[PREPROCESSOR_FEATURE_SIZE])
Runs an inference with each model using the new spectrogram features.
struct FrontendConfig frontend_config_
std::unique_ptr< VADModel > vad_model_
std::weak_ptr< ring_buffer::RingBuffer > ring_buffer_
void add_vad_model(const uint8_t *model_start, uint8_t probability_cutoff, size_t sliding_window_size, size_t tensor_arena_size)
void unload_models_()
Deletes each model's TFLite interpreters and frees tensor arena memory.
std::vector< WakeWordModel * > get_wake_words()
StaticTask inference_task_
struct FrontendState frontend_state_
void on_ota_global_state(ota::OTAState state, float progress, uint8_t error, ota::OTAComponent *comp) override
void add_data_callback(F &&data_callback)
audio::AudioStreamInfo get_audio_stream_info()
Gets the AudioStreamInfo of the data after processing.
void add_global_state_listener(OTAGlobalStateListener *listener)
static std::unique_ptr< RingBuffer > create(size_t len, MemoryPreference preference=MemoryPreference::EXTERNAL_FIRST)
@ COMMAND_RESET_RING_BUFFER
@ WARNING_FULL_RING_BUFFER
OTAGlobalCallback * get_global_ota_callback()
constexpr float AFTER_CONNECTION
For components that should be initialized after a data connection (API/MQTT) is connected.
Application App
Global storage of Application pointer - only one Application can exist.
uint8_t average_probability