ESPHome 2026.5.0-dev
Loading...
Searching...
No Matches
streaming_model.cpp
Go to the documentation of this file.
#include "streaming_model.h"

#ifdef USE_ESP32

#include "esphome/core/helpers.h"
#include "esphome/core/log.h"
7
8static const char *const TAG = "micro_wake_word";
9
10namespace esphome {
11namespace micro_wake_word {
12
14 ESP_LOGCONFIG(TAG,
15 " - Wake Word: %s\n"
16 " Probability cutoff: %.2f\n"
17 " Sliding window size: %d",
18 this->wake_word_.c_str(), this->probability_cutoff_ / 255.0f, this->sliding_window_size_);
19}
20
22 ESP_LOGCONFIG(TAG,
23 " - VAD Model\n"
24 " Probability cutoff: %.2f\n"
25 " Sliding window size: %d",
26 this->probability_cutoff_ / 255.0f, this->sliding_window_size_);
27}
28
30 RAMAllocator<uint8_t> arena_allocator;
31
32 if (this->var_arena_ == nullptr) {
33 this->var_arena_ = arena_allocator.allocate(STREAMING_MODEL_VARIABLE_ARENA_SIZE);
34 if (this->var_arena_ == nullptr) {
35 ESP_LOGE(TAG, "Could not allocate the streaming model's variable tensor arena.");
36 return false;
37 }
38 this->ma_ = tflite::MicroAllocator::Create(this->var_arena_, STREAMING_MODEL_VARIABLE_ARENA_SIZE);
39 this->mrv_ = tflite::MicroResourceVariables::Create(this->ma_, 20);
40 }
41
42 const tflite::Model *model = tflite::GetModel(this->model_start_);
43 if (model->version() != TFLITE_SCHEMA_VERSION) {
44 ESP_LOGE(TAG, "Streaming model's schema is not supported");
45 return false;
46 }
47
48 // Probe for the actual required tensor arena size if not yet determined
49 if (!this->tensor_arena_size_probed_) {
50 size_t probed_size = this->probe_arena_size_();
51 if (probed_size > 0) {
52 ESP_LOGD(TAG, "Probed tensor arena size: %zu bytes", probed_size);
53 this->tensor_arena_size_ = probed_size;
54 } else {
55 ESP_LOGW(TAG, "Arena size probe failed, using manifest size: %zu bytes", this->tensor_arena_size_);
56 }
57 this->tensor_arena_size_probed_ = true;
58 }
59
60 if (this->tensor_arena_ == nullptr) {
61 this->tensor_arena_ = arena_allocator.allocate(this->tensor_arena_size_);
62 if (this->tensor_arena_ == nullptr) {
63 ESP_LOGE(TAG, "Could not allocate the streaming model's tensor arena.");
64 return false;
65 }
66 }
67
68 if (this->interpreter_ == nullptr) {
69 this->interpreter_ =
70 make_unique<tflite::MicroInterpreter>(tflite::GetModel(this->model_start_), this->streaming_op_resolver_,
71 this->tensor_arena_, this->tensor_arena_size_, this->mrv_);
72 if (this->interpreter_->AllocateTensors() != kTfLiteOk) {
73 ESP_LOGE(TAG, "Failed to allocate tensors for the streaming model");
74 return false;
75 }
76
77 // Verify input tensor matches expected values
78 // Dimension 3 will represent the first layer stride, so skip it may vary
79 TfLiteTensor *input = this->interpreter_->input(0);
80 if ((input->dims->size != 3) || (input->dims->data[0] != 1) ||
81 (input->dims->data[2] != PREPROCESSOR_FEATURE_SIZE)) {
82 ESP_LOGE(TAG, "Streaming model tensor input dimensions has improper dimensions.");
83 return false;
84 }
85
86 if (input->type != kTfLiteInt8) {
87 ESP_LOGE(TAG, "Streaming model tensor input is not int8.");
88 return false;
89 }
90
91 // Verify output tensor matches expected values
92 TfLiteTensor *output = this->interpreter_->output(0);
93 if ((output->dims->size != 2) || (output->dims->data[0] != 1) || (output->dims->data[1] != 1)) {
94 ESP_LOGE(TAG, "Streaming model tensor output dimension is not 1x1.");
95 return false;
96 }
97
98 if (output->type != kTfLiteUInt8) {
99 ESP_LOGE(TAG, "Streaming model tensor output is not uint8.");
100 return false;
101 }
102 }
103
104 this->loaded_ = true;
105 this->reset_probabilities();
106 return true;
107}
108
110 RAMAllocator<uint8_t> arena_allocator;
111
112 // Try with the manifest size first, then escalates to 1.5, then 2x if it fails. Different platforms and different
113 // versions of the esp-nn library require different amounts of memory, so the manifest size may not always be correct,
114 // and probing allows us to find the actual required size for the current build and platform. Aligns test sizes to 16
115 // bytes.
116 size_t attempt_sizes[] = {(this->tensor_arena_size_ + 15) & ~15, (this->tensor_arena_size_ * 3 / 2 + 15) & ~15,
117 (this->tensor_arena_size_ * 2 + 15) & ~15};
118
119 for (size_t attempt_size : attempt_sizes) {
120 uint8_t *probe_arena = arena_allocator.allocate(attempt_size);
121 if (probe_arena == nullptr) {
122 continue;
123 }
124
125 // Verify the model works at all with this arena size
126 auto probe_interpreter = make_unique<tflite::MicroInterpreter>(
127 tflite::GetModel(this->model_start_), this->streaming_op_resolver_, probe_arena, attempt_size, this->mrv_);
128
129 if (probe_interpreter->AllocateTensors() != kTfLiteOk) {
130 probe_interpreter.reset();
131 arena_allocator.deallocate(probe_arena, attempt_size);
132 this->ma_ = tflite::MicroAllocator::Create(this->var_arena_, STREAMING_MODEL_VARIABLE_ARENA_SIZE);
133 this->mrv_ = tflite::MicroResourceVariables::Create(this->ma_, 20);
134 continue;
135 }
136
137 // Try to shrink the arena. Start with arena_used_bytes() + 16 (rounded to 16-byte alignment).
138 // If that works, use it. Otherwise, try midpoints between that and the full size until one succeeds.
139 size_t lower = (probe_interpreter->arena_used_bytes() + 16 + 15) & ~15;
140 probe_interpreter.reset();
141 this->ma_ = tflite::MicroAllocator::Create(this->var_arena_, STREAMING_MODEL_VARIABLE_ARENA_SIZE);
142 this->mrv_ = tflite::MicroResourceVariables::Create(this->ma_, 20);
143
144 size_t upper = attempt_size;
145
146 while (lower < upper) {
147 auto test_interpreter = make_unique<tflite::MicroInterpreter>(
148 tflite::GetModel(this->model_start_), this->streaming_op_resolver_, probe_arena, lower, this->mrv_);
149
150 bool ok = test_interpreter->AllocateTensors() == kTfLiteOk;
151
152 test_interpreter.reset();
153 this->ma_ = tflite::MicroAllocator::Create(this->var_arena_, STREAMING_MODEL_VARIABLE_ARENA_SIZE);
154 this->mrv_ = tflite::MicroResourceVariables::Create(this->ma_, 20);
155
156 if (ok) {
157 // Found a working size smaller than the full arena
158 upper = lower + 16; // Pad by 16 bytes to be safe for future allocations
159 break;
160 }
161
162 // Try the midpoint between current attempt and full size
163 lower = ((lower + upper) / 2 + 15) & ~15;
164 }
165
166 arena_allocator.deallocate(probe_arena, attempt_size);
167 return upper;
168 }
169
170 return 0;
171}
172
174 this->interpreter_.reset();
175
176 RAMAllocator<uint8_t> arena_allocator;
177
178 if (this->tensor_arena_ != nullptr) {
179 arena_allocator.deallocate(this->tensor_arena_, this->tensor_arena_size_);
180 this->tensor_arena_ = nullptr;
181 }
182
183 if (this->var_arena_ != nullptr) {
184 arena_allocator.deallocate(this->var_arena_, STREAMING_MODEL_VARIABLE_ARENA_SIZE);
185 this->var_arena_ = nullptr;
186 }
187
188 this->loaded_ = false;
189}
190
191bool StreamingModel::perform_streaming_inference(const int8_t features[PREPROCESSOR_FEATURE_SIZE]) {
192 if (this->enabled_ && !this->loaded_) {
193 // Model is enabled but isn't loaded
194 if (!this->load_model_()) {
195 return false;
196 }
197 }
198
199 if (!this->enabled_ && this->loaded_) {
200 // Model is disabled but still loaded
201 this->unload_model();
202 return true;
203 }
204
205 if (this->loaded_) {
206 TfLiteTensor *input = this->interpreter_->input(0);
207
208 uint8_t stride = this->interpreter_->input(0)->dims->data[1];
209 this->current_stride_step_ = this->current_stride_step_ % stride;
210
211 std::memmove(
212 (int8_t *) (tflite::GetTensorData<int8_t>(input)) + PREPROCESSOR_FEATURE_SIZE * this->current_stride_step_,
213 features, PREPROCESSOR_FEATURE_SIZE);
214 ++this->current_stride_step_;
215
216 if (this->current_stride_step_ >= stride) {
217 TfLiteStatus invoke_status = this->interpreter_->Invoke();
218 if (invoke_status != kTfLiteOk) {
219 ESP_LOGW(TAG, "Streaming interpreter invoke failed");
220 return false;
221 }
222
223 TfLiteTensor *output = this->interpreter_->output(0);
224
225 ++this->last_n_index_;
226 if (this->last_n_index_ == this->sliding_window_size_)
227 this->last_n_index_ = 0;
228 this->recent_streaming_probabilities_[this->last_n_index_] = output->data.uint8[0]; // probability;
230 }
232 // Only increment ignore windows if less than the probability cutoff; this forces the model to "cool-off" from a
233 // previous detection and calling ``reset_probabilities`` so it avoids duplicate detections
234 this->ignore_windows_ = std::min(this->ignore_windows_ + 1, 0);
235 }
236 }
237 return true;
238}
239
241 for (auto &prob : this->recent_streaming_probabilities_) {
242 prob = 0;
243 }
244 this->ignore_windows_ = -MIN_SLICES_BEFORE_DETECTION;
245}
246
247WakeWordModel::WakeWordModel(const std::string &id, const uint8_t *model_start, uint8_t default_probability_cutoff,
248 size_t sliding_window_average_size, const std::string &wake_word, size_t tensor_arena_size,
249 bool default_enabled, bool internal_only) {
250 this->id_ = id;
251 this->model_start_ = model_start;
252 this->default_probability_cutoff_ = default_probability_cutoff;
253 this->probability_cutoff_ = default_probability_cutoff;
254 this->sliding_window_size_ = sliding_window_average_size;
255 this->recent_streaming_probabilities_.resize(sliding_window_average_size, 0);
256 this->wake_word_ = wake_word;
257 this->tensor_arena_size_ = tensor_arena_size;
259 this->current_stride_step_ = 0;
260 this->internal_only_ = internal_only;
261
263 bool enabled;
264 if (this->pref_.load(&enabled)) {
265 // Use the enabled state loaded from flash
266 this->enabled_ = enabled;
267 } else {
268 // If no state saved, then use the default
269 this->enabled_ = default_enabled;
270 }
271};
272
274 this->enabled_ = true;
275 if (!this->internal_only_) {
276 this->pref_.save(&this->enabled_);
277 }
278}
279
281 this->enabled_ = false;
282 if (!this->internal_only_) {
283 this->pref_.save(&this->enabled_);
284 }
285}
286
288 DetectionEvent detection_event;
289 detection_event.wake_word = &this->wake_word_;
290 detection_event.max_probability = 0;
291 detection_event.average_probability = 0;
292
293 if ((this->ignore_windows_ < 0) || !this->enabled_) {
294 detection_event.detected = false;
295 return detection_event;
296 }
297
298 uint32_t sum = 0;
299 for (auto &prob : this->recent_streaming_probabilities_) {
300 detection_event.max_probability = std::max(detection_event.max_probability, prob);
301 sum += prob;
302 }
303
304 detection_event.average_probability = sum / this->sliding_window_size_;
305 detection_event.detected = sum > this->probability_cutoff_ * this->sliding_window_size_;
306
308 return detection_event;
309}
310
311VADModel::VADModel(const uint8_t *model_start, uint8_t default_probability_cutoff, size_t sliding_window_size,
312 size_t tensor_arena_size) {
313 this->model_start_ = model_start;
314 this->default_probability_cutoff_ = default_probability_cutoff;
315 this->probability_cutoff_ = default_probability_cutoff;
316 this->sliding_window_size_ = sliding_window_size;
317 this->recent_streaming_probabilities_.resize(sliding_window_size, 0);
318 this->tensor_arena_size_ = tensor_arena_size;
320}
321
323 DetectionEvent detection_event;
324 detection_event.max_probability = 0;
325 detection_event.average_probability = 0;
326
327 if (!this->enabled_) {
328 // We disabled the VAD model for some reason... so we shouldn't block wake words from being detected
329 detection_event.detected = true;
330 return detection_event;
331 }
332
333 uint32_t sum = 0;
334 for (auto &prob : this->recent_streaming_probabilities_) {
335 detection_event.max_probability = std::max(detection_event.max_probability, prob);
336 sum += prob;
337 }
338
339 detection_event.average_probability = sum / this->sliding_window_size_;
340 detection_event.detected = sum > (this->probability_cutoff_ * this->sliding_window_size_);
341
342 return detection_event;
343}
344
345bool StreamingModel::register_streaming_ops_(tflite::MicroMutableOpResolver<20> &op_resolver) {
346 if (op_resolver.AddCallOnce() != kTfLiteOk)
347 return false;
348 if (op_resolver.AddVarHandle() != kTfLiteOk)
349 return false;
350 if (op_resolver.AddReshape() != kTfLiteOk)
351 return false;
352 if (op_resolver.AddReadVariable() != kTfLiteOk)
353 return false;
354 if (op_resolver.AddStridedSlice() != kTfLiteOk)
355 return false;
356 if (op_resolver.AddConcatenation() != kTfLiteOk)
357 return false;
358 if (op_resolver.AddAssignVariable() != kTfLiteOk)
359 return false;
360 if (op_resolver.AddConv2D() != kTfLiteOk)
361 return false;
362 if (op_resolver.AddMul() != kTfLiteOk)
363 return false;
364 if (op_resolver.AddAdd() != kTfLiteOk)
365 return false;
366 if (op_resolver.AddMean() != kTfLiteOk)
367 return false;
368 if (op_resolver.AddFullyConnected() != kTfLiteOk)
369 return false;
370 if (op_resolver.AddLogistic() != kTfLiteOk)
371 return false;
372 if (op_resolver.AddQuantize() != kTfLiteOk)
373 return false;
374 if (op_resolver.AddDepthwiseConv2D() != kTfLiteOk)
375 return false;
376 if (op_resolver.AddAveragePool2D() != kTfLiteOk)
377 return false;
378 if (op_resolver.AddMaxPool2D() != kTfLiteOk)
379 return false;
380 if (op_resolver.AddPad() != kTfLiteOk)
381 return false;
382 if (op_resolver.AddPack() != kTfLiteOk)
383 return false;
384 if (op_resolver.AddSplitV() != kTfLiteOk)
385 return false;
386
387 return true;
388}
389
390} // namespace micro_wake_word
391} // namespace esphome
392
393#endif
An STL allocator that uses SPI or internal RAM.
Definition helpers.h:2212
void deallocate(T *p, size_t n)
Definition helpers.h:2267
T * allocate(size_t n)
Definition helpers.h:2229
bool load_model_()
Allocates tensor and variable arenas and sets up the model interpreter.
std::unique_ptr< tflite::MicroInterpreter > interpreter_
tflite::MicroMutableOpResolver< 20 > streaming_op_resolver_
bool register_streaming_ops_(tflite::MicroMutableOpResolver< 20 > &op_resolver)
Returns true if successfully registered the streaming model's TensorFlow operations.
void reset_probabilities()
Sets all recent_streaming_probabilities to 0 and resets the ignore window count.
std::vector< uint8_t > recent_streaming_probabilities_
size_t probe_arena_size_()
Probes the actual required tensor arena size by trial allocation.
tflite::MicroResourceVariables * mrv_
bool perform_streaming_inference(const int8_t features[PREPROCESSOR_FEATURE_SIZE])
void unload_model()
Destroys the TFLite interpreter and frees the tensor and variable arenas' memory.
DetectionEvent determine_detected() override
Checks for voice activity by comparing the max probability in the sliding window with the probability...
VADModel(const uint8_t *model_start, uint8_t default_probability_cutoff, size_t sliding_window_size, size_t tensor_arena_size)
void enable() override
Enable the model and save to flash. The next performing_streaming_inference call will load it.
DetectionEvent determine_detected() override
Checks for the wake word by comparing the mean probability in the sliding window with the probability...
WakeWordModel(const std::string &id, const uint8_t *model_start, uint8_t default_probability_cutoff, size_t sliding_window_average_size, const std::string &wake_word, size_t tensor_arena_size, bool default_enabled, bool internal_only)
Constructs a wake word model object.
void disable() override
Disable the model and save to flash. The next performing_streaming_inference call will unload it.
uint16_t id
Providing packet encoding functions for exchanging data with a remote host.
Definition a01nyub.cpp:7
ESPPreferences * global_preferences
uint32_t fnv1_hash(const char *str)
Calculate a FNV-1 hash of str.
Definition helpers.cpp:161
static void uint32_t
ESPPreferenceObject make_preference(size_t, uint32_t, bool)
Definition preferences.h:24