ESPHome 2026.6.0-dev
Loading...
Searching...
No Matches
voice_assistant.cpp
Go to the documentation of this file.
1#include "voice_assistant.h"
3
4#ifdef USE_VOICE_ASSISTANT
5
8#include "esphome/core/log.h"
9
10#include <cinttypes>
11#include <cstdio>
12
14
15static const char *const TAG = "voice_assistant";
16
17#ifdef SAMPLE_RATE_HZ
18#undef SAMPLE_RATE_HZ
19#endif
20
21static const size_t SAMPLE_RATE_HZ = 16000;
22
23static const size_t RING_BUFFER_SAMPLES = 512 * SAMPLE_RATE_HZ / 1000; // 512 ms * 16 kHz/ 1000 ms
24static const size_t RING_BUFFER_SIZE = RING_BUFFER_SAMPLES * sizeof(int16_t);
25static const size_t SEND_BUFFER_SAMPLES = 32 * SAMPLE_RATE_HZ / 1000; // 32ms * 16kHz / 1000ms
26static const size_t SEND_BUFFER_SIZE = SEND_BUFFER_SAMPLES * sizeof(int16_t);
27static const size_t RECEIVE_SIZE = 1024;
28static const size_t SPEAKER_BUFFER_SIZE = 16 * RECEIVE_SIZE;
29
30// If one microphone channel keeps producing audio while another configured channel produces none for this
31// long, treat the silent channel as failed and stop the stream. A working microphone exposes a chunk every
32// SEND_BUFFER_SAMPLES (32 ms), so this is far longer than any legitimate gap between chunks.
33static const uint32_t AUDIO_CHANNEL_STALL_TIMEOUT_MS = 2000;
34
36
38 this->mic_source_->add_data_callback([this](const std::vector<uint8_t> &data) {
39 std::shared_ptr<ring_buffer::RingBuffer> temp_ring_buffer = this->ring_buffer_.lock();
40 if (temp_ring_buffer != nullptr) {
41 temp_ring_buffer->write((void *) data.data(), data.size());
42 }
43 });
44
45 // Second microphone channel
46 if (this->mic_source2_ != nullptr) {
47 this->mic_source2_->add_data_callback([this](const std::vector<uint8_t> &data) {
48 std::shared_ptr<ring_buffer::RingBuffer> temp_ring_buffer = this->ring_buffer2_.lock();
49 if (temp_ring_buffer != nullptr) {
50 temp_ring_buffer->write((void *) data.data(), data.size());
51 }
52 });
53 }
54
55#ifdef USE_MEDIA_PLAYER
56 if (this->media_player_ != nullptr) {
58 switch (state) {
61 // State changed to announcing after receiving the url
63 }
64 break;
65 default:
67 // No longer announcing the TTS response
69 }
70 break;
71 }
72 });
73 }
74#endif
75}
76
78
80 this->socket_ = socket::socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
81 if (this->socket_ == nullptr) {
82 ESP_LOGE(TAG, "Could not create socket");
83 this->mark_failed();
84 return false;
85 }
86 int enable = 1;
87 int err = this->socket_->setsockopt(SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int));
88 if (err != 0) {
89 ESP_LOGW(TAG, "Socket unable to set reuseaddr: errno %d", err);
90 // we can still continue
91 }
92 err = this->socket_->setblocking(false);
93 if (err != 0) {
94 ESP_LOGE(TAG, "Socket unable to set nonblocking mode: errno %d", err);
95 this->mark_failed();
96 return false;
97 }
98
99#ifdef USE_SPEAKER
100 if (this->speaker_ != nullptr) {
101 struct sockaddr_storage server;
102
103 socklen_t sl = socket::set_sockaddr_any((struct sockaddr *) &server, sizeof(server), 6055);
104 if (sl == 0) {
105 ESP_LOGE(TAG, "Socket unable to set sockaddr: errno %d", errno);
106 this->mark_failed();
107 return false;
108 }
109
110 err = this->socket_->bind((struct sockaddr *) &server, sizeof(server));
111 if (err != 0) {
112 ESP_LOGE(TAG, "Socket unable to bind: errno %d", errno);
113 this->mark_failed();
114 return false;
115 }
116 }
117#endif
118 this->udp_socket_running_ = true;
119 return true;
120}
121
123#ifdef USE_SPEAKER
124 if ((this->speaker_ != nullptr) && (this->speaker_buffer_ == nullptr)) {
125 RAMAllocator<uint8_t> speaker_allocator;
126 this->speaker_buffer_ = speaker_allocator.allocate(SPEAKER_BUFFER_SIZE);
127 if (this->speaker_buffer_ == nullptr) {
128 ESP_LOGW(TAG, "Could not allocate speaker buffer");
129 return false;
130 }
131 }
132#endif
133
134 if (this->audio_source_ == nullptr) {
135 std::shared_ptr<ring_buffer::RingBuffer> temp_ring_buffer = ring_buffer::RingBuffer::create(RING_BUFFER_SIZE);
136 if (temp_ring_buffer == nullptr) {
137 ESP_LOGE(TAG, "Could not allocate ring buffer");
138 return false;
139 }
140 // Zero-copy source that reads directly from the ring buffer; frame-aligned to never split an int16 sample.
141 this->audio_source_ = audio::RingBufferAudioSource::create(temp_ring_buffer, SEND_BUFFER_SIZE, sizeof(int16_t));
142 if (this->audio_source_ == nullptr) {
143 ESP_LOGE(TAG, "Could not allocate audio source");
144 return false;
145 }
146 this->ring_buffer_ = temp_ring_buffer;
147 }
148
149 // Second microphone channel
150 if ((this->mic_source2_ != nullptr) && (this->audio_source2_ == nullptr)) {
151 std::shared_ptr<ring_buffer::RingBuffer> temp_ring_buffer = ring_buffer::RingBuffer::create(RING_BUFFER_SIZE);
152 if (temp_ring_buffer == nullptr) {
153 ESP_LOGE(TAG, "Could not allocate second ring buffer");
154 return false;
155 }
156 this->audio_source2_ = audio::RingBufferAudioSource::create(temp_ring_buffer, SEND_BUFFER_SIZE, sizeof(int16_t));
157 if (this->audio_source2_ == nullptr) {
158 ESP_LOGE(TAG, "Could not allocate second audio source");
159 return false;
160 }
161 this->ring_buffer2_ = temp_ring_buffer;
162 }
163
164 return true;
165}
166
168 if (this->audio_source_ != nullptr) {
169 this->audio_source_->clear_buffered_data();
170 }
171
172 // Second microphone channel
173 if (this->audio_source2_ != nullptr) {
174 this->audio_source2_->clear_buffered_data();
175 }
176
177 // Reset the multi-channel stall watchdog (see audio_channel_stall_start_).
179
180#ifdef USE_SPEAKER
181 if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
182 memset(this->speaker_buffer_, 0, SPEAKER_BUFFER_SIZE);
183
184 this->speaker_buffer_size_ = 0;
185 this->speaker_buffer_index_ = 0;
186 this->speaker_bytes_received_ = 0;
187 }
188#endif
189}
190
192 // Destroying each source releases its ring buffer; the matching weak_ptr then expires automatically.
193 this->audio_source_.reset();
194
195 // Second microphone channel
196 this->audio_source2_.reset();
197
198#ifdef USE_SPEAKER
199 if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
200 RAMAllocator<uint8_t> speaker_deallocator;
201 speaker_deallocator.deallocate(this->speaker_buffer_, SPEAKER_BUFFER_SIZE);
202 this->speaker_buffer_ = nullptr;
203 }
204#endif
205}
206
208 this->conversation_id_ = "";
209 ESP_LOGD(TAG, "reset conversation ID");
210}
211
213 // Both microphone channels are sent together, if configured. Home Assistant feeds one of the
214 // channels to its speech-to-text stream and treats an empty payload on that channel as
215 // end-of-stream, and the device cannot know which channel it picked, so only send once every
216 // configured channel has audio exposed, and always send them together. We don't target any
217 // particular message size: Home Assistant re-chunks the audio, and each fill() exposes at most
218 // SEND_BUFFER_SIZE bytes.
219 while (true) {
220 // fill() exposes a new chunk, or returns 0 if a previous chunk is still exposed; available()
221 // reports the currently exposed bytes either way.
222 this->audio_source_->fill(0, false);
223 size_t available = this->audio_source_->available();
224 size_t available2 = 0;
225 if (this->audio_source2_ != nullptr) {
226 this->audio_source2_->fill(0, false);
227 available2 = this->audio_source2_->available();
228 }
229
230 const bool channel_empty = (available == 0);
231 const bool channel2_empty = (this->audio_source2_ != nullptr) && (available2 == 0);
232 if (channel_empty || channel2_empty) {
233 // A configured channel has no audio yet, so keep any chunk exposed on the other channel for the
234 // next pass rather than sending an empty payload.
235 this->handle_channel_stall_(available, available2);
236 break;
237 }
238
239 // Both channels have audio exposed; clear any in-progress stall timer.
241
243 // Zero-copy: send_message() copies the data out before we consume it.
244 msg.data = this->audio_source_->data();
245 msg.data_len = available;
246 if (this->audio_source2_ != nullptr) {
247 msg.data2 = this->audio_source2_->data();
248 msg.data2_len = available2;
249 }
250
251 this->api_client_->send_message(msg);
252
253 this->audio_source_->consume(available);
254 if (this->audio_source2_ != nullptr) {
255 this->audio_source2_->consume(available2);
256 }
257 }
258}
259
// Watchdog for a microphone channel that exposes no audio while the other channel does.
//   available  - bytes currently exposed on the primary channel's audio source
//   available2 - bytes currently exposed on the second channel's audio source (callers pass 0 when
//                no second source exists)
260void VoiceAssistant::handle_channel_stall_(size_t available, size_t available2) {
261 // Called when at least one configured channel has no audio exposed. When one channel has data and the
262 // other does not, watch how long the empty channel stays starved: Home Assistant has no stream timeout
263 // and would never tell us to stop, so a channel that fails outright would otherwise hang streaming
264 // forever with the live channel's chunk held. Stop the stream with an error after a prolonged imbalance.
265 if ((available == 0) && (available2 == 0)) {
266 // Both channels are idle (no audio buffered yet); normal, not a stalled channel.
268 return;
269 }
270
// NOTE(review): `now` is read below, but no declaration of it appears in this listing (numbered
// line 271 is absent) - confirm where it is set against the original source.
// A stored start time of 0 is treated as "no stall is being timed"; the first imbalanced pass
// records the start time and later passes compare against it.
272 if (this->audio_channel_stall_start_ == 0) {
273 this->audio_channel_stall_start_ = now;
// The imbalance has lasted at least AUDIO_CHANNEL_STALL_TIMEOUT_MS: log it and stop the stream.
274 } else if ((now - this->audio_channel_stall_start_) >= AUDIO_CHANNEL_STALL_TIMEOUT_MS) {
// Logs channel 0 when the primary channel is the empty one, otherwise channel 1.
275 ESP_LOGW(TAG, "Mic channel %d stalled, stopping stream", (available == 0) ? 0 : 1);
277 this->signal_stop_();
// The error trigger is fired from a deferred callback rather than inline.
279 this->defer([this]() {
280 this->error_trigger_.trigger("mic-channel-stalled", "A microphone channel stopped producing audio");
281 });
282 }
283}
284
286 if (this->api_client_ == nullptr && this->state_ != State::IDLE && this->state_ != State::STOP_MICROPHONE &&
288 if (this->mic_source_->is_running() || (this->mic_source2_ && this->mic_source2_->is_running()) ||
289 this->state_ == State::STARTING_MICROPHONE) {
291 } else {
293 }
294 this->continuous_ = false;
295 this->signal_stop_();
296 this->clear_buffers_();
297 return;
298 }
299 switch (this->state_) {
300 case State::IDLE: {
301 if (this->continuous_ && this->desired_state_ == State::IDLE) {
302 this->idle_trigger_.trigger();
304 } else {
305 this->deallocate_buffers_();
306 }
307 break;
308 }
310 ESP_LOGD(TAG, "Starting Microphone");
311 if (!this->allocate_buffers_()) {
312 this->status_set_error(LOG_STR("Failed to allocate buffers"));
313 return;
314 }
315 if (this->status_has_error()) {
316 this->status_clear_error();
317 }
318 this->clear_buffers_();
319
320 this->mic_source_->start();
321 if (this->mic_source2_) {
322 this->mic_source2_->start();
323 }
325 break;
326 }
328 if (this->mic_source_->is_running() && (!this->mic_source2_ || this->mic_source2_->is_running())) {
329 this->set_state_(this->desired_state_);
330 }
331 break;
332 }
334 ESP_LOGD(TAG, "Requesting start");
335 uint32_t flags = 0;
336 if (!this->continue_conversation_ && this->use_wake_word_)
338 if (this->silence_detection_)
342 audio_settings.auto_gain = this->auto_gain_;
343 audio_settings.volume_multiplier = this->volume_multiplier_;
344
346 msg.start = true;
348 msg.flags = flags;
349 msg.audio_settings = audio_settings;
351
352 // Reset media player state tracking
353#ifdef USE_MEDIA_PLAYER
354 if (this->media_player_ != nullptr) {
356 }
357#endif
358
359 if (this->api_client_ == nullptr || !this->api_client_->send_message(msg)) {
360 ESP_LOGW(TAG, "Could not request start");
361 this->error_trigger_.trigger("not-connected", "Could not request start");
362 this->continuous_ = false;
364 break;
365 }
367 this->set_timeout("reset-conversation_id", this->conversation_timeout_,
368 [this]() { this->reset_conversation_id(); });
369 break;
370 }
372 break; // State changed when udp server port received
373 }
375 // pre_shift is ignored by RingBufferAudioSource (no intermediate transfer buffer to compact).
376 if (this->audio_mode_ == AUDIO_MODE_API) {
377 this->stream_api_audio_();
378 } else {
379 // UDP (will eventually be deprecated)
380 // Only the primary microphone channel is used
381 while (true) {
382 this->audio_source_->fill(0, false);
383 size_t available = this->audio_source_->available();
384 if (available == 0) {
385 break;
386 }
387 if (!this->udp_socket_running_) {
388 if (!this->start_udp_socket_()) {
390 break;
391 }
392 }
393 this->socket_->sendto(this->audio_source_->data(), available, 0, (struct sockaddr *) &this->dest_addr_,
394 sizeof(this->dest_addr_));
395 this->audio_source_->consume(available);
396 }
397 } // audio mode
398 break;
399 }
401 // Check both microphone channels
402 bool is_running = this->mic_source_->is_running();
403 bool is_running2 = false;
404 if (this->mic_source2_) {
405 is_running2 = this->mic_source2_->is_running();
406 }
407 if (is_running || is_running2) {
408 if (is_running) {
409 this->mic_source_->stop();
410 }
411 if (is_running2) {
412 this->mic_source2_->stop();
413 }
415 } else {
416 this->set_state_(this->desired_state_);
417 }
418 break;
419 }
421 // Check both microphone channels
422 bool is_stopped = this->mic_source_->is_stopped();
423 bool is_stopped2 = true;
424 if (this->mic_source2_) {
425 is_stopped2 = this->mic_source2_->is_stopped();
426 }
427 if (is_stopped && is_stopped2) {
428 this->set_state_(this->desired_state_);
429 }
430 break;
431 }
433 break; // State changed by events
434 }
436 bool playing = false;
437#ifdef USE_SPEAKER
438 if (this->speaker_ != nullptr) {
439 ssize_t received_len = 0;
440 if (this->audio_mode_ == AUDIO_MODE_UDP) {
441 if (this->speaker_buffer_index_ + RECEIVE_SIZE < SPEAKER_BUFFER_SIZE) {
442 received_len = this->socket_->read(this->speaker_buffer_ + this->speaker_buffer_index_, RECEIVE_SIZE);
443 if (received_len > 0) {
444 this->speaker_buffer_index_ += received_len;
445 this->speaker_buffer_size_ += received_len;
446 this->speaker_bytes_received_ += received_len;
447 }
448 } else {
449 ESP_LOGD(TAG, "Receive buffer full");
450 }
451 }
452 // Build a small buffer of audio before sending to the speaker
453 bool end_of_stream = this->stream_ended_ && (this->audio_mode_ == AUDIO_MODE_API || received_len < 0);
454 if (this->speaker_bytes_received_ > RECEIVE_SIZE * 4 || end_of_stream)
455 this->write_speaker_();
456 if (this->wait_for_stream_end_) {
457 this->cancel_timeout("playing");
458 if (end_of_stream) {
459 ESP_LOGD(TAG, "End of audio stream received");
460 this->cancel_timeout("speaker-timeout");
462 }
463 break; // We dont want to timeout here as the STREAM_END event will take care of that.
464 }
465 playing = this->speaker_->is_running();
466 }
467#endif
468#ifdef USE_MEDIA_PLAYER
469 if (this->media_player_ != nullptr) {
471
474 this->cancel_timeout("playing");
475 ESP_LOGD(TAG, "Announcement finished playing");
477
479 msg.success = true;
480 this->api_client_->send_message(msg);
481 break;
482 }
483 }
484#endif
485 if (playing) {
487 }
488 break;
489 }
491#ifdef USE_SPEAKER
492 if (this->speaker_ != nullptr) {
493 if (this->speaker_buffer_size_ > 0) {
494 this->write_speaker_();
495 break;
496 }
497 if (this->speaker_->has_buffered_data() || this->speaker_->is_running()) {
498 break;
499 }
500 ESP_LOGD(TAG, "Speaker has finished outputting all audio");
501 this->speaker_->stop();
502 this->cancel_timeout("speaker-timeout");
503 this->cancel_timeout("playing");
504
505 this->clear_buffers_();
506
507 this->wait_for_stream_end_ = false;
508 this->stream_ended_ = false;
509
511 }
512#endif
513 if (this->continue_conversation_) {
515 } else {
517 }
518 break;
519 }
520 default:
521 break;
522 }
523}
524
525#ifdef USE_SPEAKER
527 if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
528 if (this->speaker_buffer_size_ > 0) {
529 size_t write_chunk = std::min<size_t>(this->speaker_buffer_size_, 4 * 1024);
530 size_t written = this->speaker_->play(this->speaker_buffer_, write_chunk);
531 if (written > 0) {
532 memmove(this->speaker_buffer_, this->speaker_buffer_ + written, this->speaker_buffer_size_ - written);
535 this->set_timeout("speaker-timeout", 5000, [this]() { this->speaker_->stop(); });
536 } else {
537 ESP_LOGV(TAG, "Speaker buffer full, trying again next loop");
538 }
539 }
540 }
541}
542#endif
543
545 if (!subscribe) {
546 if (this->api_client_ == nullptr || client != this->api_client_) {
547 ESP_LOGE(TAG, "Client attempting to unsubscribe that is not the current API Client");
548 return;
549 }
550 this->api_client_ = nullptr;
552 return;
553 }
554
555 if (this->api_client_ != nullptr) {
556 char current_peername[socket::SOCKADDR_STR_LEN];
557 char new_peername[socket::SOCKADDR_STR_LEN];
558 ESP_LOGE(TAG,
559 "Multiple API Clients attempting to connect to Voice Assistant\n"
560 " Current client: %s (%s)\n"
561 " New client: %s (%s)",
562 this->api_client_->get_name(), this->api_client_->get_peername_to(current_peername), client->get_name(),
563 client->get_peername_to(new_peername));
564 return;
565 }
566
567 this->api_client_ = client;
569}
570
571static const LogString *voice_assistant_state_to_string(State state) {
572 switch (state) {
573 case State::IDLE:
574 return LOG_STR("IDLE");
576 return LOG_STR("START_MICROPHONE");
578 return LOG_STR("STARTING_MICROPHONE");
580 return LOG_STR("WAIT_FOR_VAD");
582 return LOG_STR("WAITING_FOR_VAD");
584 return LOG_STR("START_PIPELINE");
586 return LOG_STR("STARTING_PIPELINE");
588 return LOG_STR("STREAMING_MICROPHONE");
590 return LOG_STR("STOP_MICROPHONE");
592 return LOG_STR("STOPPING_MICROPHONE");
594 return LOG_STR("AWAITING_RESPONSE");
596 return LOG_STR("STREAMING_RESPONSE");
598 return LOG_STR("RESPONSE_FINISHED");
599 default:
600 return LOG_STR("UNKNOWN");
601 }
602};
603
605 State old_state = this->state_;
606 this->state_ = state;
607 ESP_LOGD(TAG, "State changed from %s to %s", LOG_STR_ARG(voice_assistant_state_to_string(old_state)),
608 LOG_STR_ARG(voice_assistant_state_to_string(state)));
609}
610
612 this->set_state_(state);
613 this->desired_state_ = desired_state;
614 ESP_LOGD(TAG, "Desired state set to %s", LOG_STR_ARG(voice_assistant_state_to_string(desired_state)));
615}
616
618 ESP_LOGE(TAG, "Failed to start server. See Home Assistant logs for more details.");
619 this->error_trigger_.trigger("failed-to-start", "Failed to start server. See Home Assistant logs for more details.");
621}
622
624 if (this->state_ != State::STARTING_PIPELINE) {
625 this->signal_stop_();
626 return;
627 }
628
629 ESP_LOGD(TAG, "Client started, streaming microphone");
631
632 // Both microphone channels
633 if (this->mic_source_->is_running() && (!this->mic_source2_ || this->mic_source2_->is_running())) {
635 } else {
637 }
638}
639
// Begins microphone streaming over UDP to the given destination.
//   addr - destination socket address; sizeof(dest_addr_) bytes are copied from it into dest_addr_
//   port - destination port in host byte order (converted with htons before being stored)
// Only acts while the state is STARTING_PIPELINE; in any other state it calls signal_stop_() and returns.
640void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t port) {
641 if (this->state_ != State::STARTING_PIPELINE) {
642 this->signal_stop_();
643 return;
644 }
645
646 ESP_LOGD(TAG, "Client started, streaming microphone");
648
// A configured second microphone is only warned about here; it is not streamed in this mode.
649 if (this->mic_source2_ != nullptr) {
650 ESP_LOGW(TAG, "UDP audio mode does not support a second microphone channel; only the primary will be streamed");
651 }
652
// Store the destination, then overwrite its port field according to the address family.
653 memcpy(&this->dest_addr_, addr, sizeof(this->dest_addr_));
654 if (this->dest_addr_.ss_family == AF_INET) {
655 ((struct sockaddr_in *) &this->dest_addr_)->sin_port = htons(port);
656 }
// The IPv6 branch is compiled only when LWIP_IPV6 is non-zero.
657#if LWIP_IPV6
658 else if (this->dest_addr_.ss_family == AF_INET6) {
659 ((struct sockaddr_in6 *) &this->dest_addr_)->sin6_port = htons(port);
660 }
661#endif
662 else {
// Any other address family: warn and return without reaching the branch below.
663 ESP_LOGW(TAG, "Unknown address family: %d", this->dest_addr_.ss_family);
664 return;
665 }
666
667 // Only primary microphone channel over UDP
// NOTE(review): both branch bodies are empty in this listing (numbered lines 669 and 671 are
// absent); presumably each selects the next state - confirm against the original source.
668 if (this->mic_source_->is_running()) {
670 } else {
672 }
673}
674
// Requests the start of a voice assistant run.
//   continuous        - stored in continuous_ when the request is accepted
//   silence_detection - stored in silence_detection_ when the request is accepted
// Without a connected API client the request is rejected: an error is logged and continuous_ is cleared.
// The request is only acted on while the state is IDLE; in any other state the call has no visible effect.
675void VoiceAssistant::request_start(bool continuous, bool silence_detection) {
676 if (this->api_client_ == nullptr) {
677 ESP_LOGE(TAG, "No API client connected");
679 this->continuous_ = false;
680 return;
681 }
682 if (this->state_ == State::IDLE) {
683 this->continuous_ = continuous;
684 this->silence_detection_ = silence_detection;
685
// NOTE(review): numbered line 686 is absent from this listing; presumably it starts the state
// transition for the new run - confirm against the original source.
687 }
688}
689
691 this->continuous_ = false;
692 this->continue_conversation_ = false;
693
694 switch (this->state_) {
695 case State::IDLE:
696 break;
703 break;
706 this->signal_stop_();
708 break;
712 break;
714 this->signal_stop_();
715 break;
717#ifdef USE_MEDIA_PLAYER
718 // Stop any ongoing media player announcement
719 if (this->media_player_ != nullptr) {
720 this->media_player_->make_call()
722 .set_announcement(true)
723 .perform();
724 }
725 if (this->started_streaming_tts_) {
726 // Haven't reached the TTS_END stage, so send the stop signal to HA.
727 this->signal_stop_();
728 }
729#endif
730 break;
732 break; // Let the incoming audio stream finish then it will go to idle.
733 }
734}
735
737 memset(&this->dest_addr_, 0, sizeof(this->dest_addr_));
738 if (this->api_client_ == nullptr) {
739 return;
740 }
741 ESP_LOGD(TAG, "Signaling stop");
743 msg.start = false;
744 this->api_client_->send_message(msg);
745}
746
748 this->set_timeout("playing", 2000, [this]() {
749 this->cancel_timeout("speaker-timeout");
751
752 if (this->api_client_ == nullptr)
753 return;
755 msg.success = true;
756 this->api_client_->send_message(msg);
757 });
758}
759
761 ESP_LOGD(TAG, "Event Type: %" PRId32, msg.event_type);
762 switch (msg.event_type) {
764 ESP_LOGD(TAG, "Assist Pipeline running");
765#ifdef USE_MEDIA_PLAYER
766 this->started_streaming_tts_ = false;
767 for (const auto &arg : msg.data) {
768 if (arg.name == "url") {
769 this->tts_response_url_ = arg.value;
770 }
771 }
772#endif
773 this->defer([this]() { this->start_trigger_.trigger(); });
774 break;
776 break;
778 ESP_LOGD(TAG, "Wake word detected");
779 this->defer([this]() { this->wake_word_detected_trigger_.trigger(); });
780 break;
781 }
783 ESP_LOGD(TAG, "STT started");
784 this->defer([this]() { this->listening_trigger_.trigger(); });
785 break;
787 std::string text;
788 for (const auto &arg : msg.data) {
789 if (arg.name == "text") {
790 text = arg.value;
791 }
792 }
793 if (text.empty()) {
794 ESP_LOGW(TAG, "No text in STT_END event");
795 return;
796 } else if (text.length() > 500) {
797 text.resize(497);
798 text += "...";
799 }
800 ESP_LOGD(TAG, "Speech recognised as: \"%s\"", text.c_str());
801 this->defer([this, text]() { this->stt_end_trigger_.trigger(text); });
802 break;
803 }
805 ESP_LOGD(TAG, "Intent started");
806 this->defer([this]() { this->intent_start_trigger_.trigger(); });
807 break;
809 ESP_LOGD(TAG, "Intent progress");
810 std::string tts_url_for_trigger;
811#ifdef USE_MEDIA_PLAYER
812 if (this->media_player_ != nullptr) {
813 for (const auto &arg : msg.data) {
814 if ((arg.name == "tts_start_streaming") && (arg.value == "1") && !this->tts_response_url_.empty()) {
816
818
819 this->started_streaming_tts_ = true;
821
822 tts_url_for_trigger = this->tts_response_url_;
823 this->tts_response_url_.clear(); // Reset streaming URL
825 }
826 }
827 }
828#endif
829 this->defer([this, tts_url_for_trigger]() { this->intent_progress_trigger_.trigger(tts_url_for_trigger); });
830 break;
831 }
833 for (const auto &arg : msg.data) {
834 if (arg.name == "conversation_id") {
835 this->conversation_id_ = arg.value;
836 } else if (arg.name == "continue_conversation") {
837 this->continue_conversation_ = (arg.value == "1");
838 }
839 }
840 this->defer([this]() { this->intent_end_trigger_.trigger(); });
841 break;
842 }
844 std::string text;
845 for (const auto &arg : msg.data) {
846 if (arg.name == "text") {
847 text = arg.value;
848 }
849 }
850 if (text.empty()) {
851 ESP_LOGW(TAG, "No text in TTS_START event");
852 return;
853 }
854 if (text.length() > 500) {
855 text.resize(497);
856 text += "...";
857 }
858 ESP_LOGD(TAG, "Response: \"%s\"", text.c_str());
859 this->defer([this, text]() {
860 this->tts_start_trigger_.trigger(text);
861#ifdef USE_SPEAKER
862 if (this->speaker_ != nullptr) {
863 this->speaker_->start();
864 }
865#endif
866 });
867 break;
868 }
870 std::string url;
871 for (const auto &arg : msg.data) {
872 if (arg.name == "url") {
873 url = arg.value;
874 }
875 }
876 if (url.empty()) {
877 ESP_LOGW(TAG, "No url in TTS_END event");
878 return;
879 }
880 ESP_LOGD(TAG, "Response URL: \"%s\"", url.c_str());
881 this->defer([this, url]() {
882#ifdef USE_MEDIA_PLAYER
883 if ((this->media_player_ != nullptr) && (!this->started_streaming_tts_)) {
885
887
889 }
890 this->started_streaming_tts_ = false; // Helps indicate reaching the TTS_END stage
891#endif
892 this->tts_end_trigger_.trigger(url);
893 });
895 if (new_state != this->state_) {
896 // Don't needlessly change the state. The intent progress stage may have already changed the state to
897 // streaming response.
898 this->set_state_(new_state, new_state);
899 }
900 break;
901 }
903 ESP_LOGD(TAG, "Assist Pipeline ended");
904 if ((this->state_ == State::START_PIPELINE) || (this->state_ == State::STARTING_PIPELINE) ||
906 // Microphone is running, stop it
908 } else if (this->state_ == State::AWAITING_RESPONSE) {
909 // No TTS start event ("nevermind")
911 }
912 this->defer([this]() { this->end_trigger_.trigger(); });
913 break;
914 }
916 std::string code;
917 std::string message;
918 for (const auto &arg : msg.data) {
919 if (arg.name == "code") {
920 code = arg.value;
921 } else if (arg.name == "message") {
922 message = arg.value;
923 }
924 }
925 if (code == "wake-word-timeout" || code == "wake_word_detection_aborted" || code == "no_wake_word") {
926 // Don't change state here since either the "tts-end" or "run-end" events will do it.
927 return;
928 } else if (code == "wake-provider-missing" || code == "wake-engine-missing") {
929 // Wake word is not set up or not ready on Home Assistant so stop and do not retry until user starts again.
930 this->defer([this, code, message]() {
931 this->request_stop();
932 this->error_trigger_.trigger(code, message);
933 });
934 return;
935 }
936 ESP_LOGE(TAG, "Error: %s - %s", code.c_str(), message.c_str());
937 if (this->state_ != State::IDLE) {
938 this->signal_stop_();
940 }
941 this->defer([this, code, message]() { this->error_trigger_.trigger(code, message); });
942 break;
943 }
945#ifdef USE_SPEAKER
946 if (this->speaker_ != nullptr) {
947 this->wait_for_stream_end_ = true;
948 ESP_LOGD(TAG, "TTS stream start");
949 this->defer([this] { this->tts_stream_start_trigger_.trigger(); });
950 }
951#endif
952 break;
953 }
955#ifdef USE_SPEAKER
956 if (this->speaker_ != nullptr) {
957 this->stream_ended_ = true;
958 ESP_LOGD(TAG, "TTS stream end");
959 }
960#endif
961 break;
962 }
964 ESP_LOGD(TAG, "Starting STT by VAD");
965 this->defer([this]() { this->stt_vad_start_trigger_.trigger(); });
966 break;
968 ESP_LOGD(TAG, "STT by VAD end");
970 this->defer([this]() { this->stt_vad_end_trigger_.trigger(); });
971 break;
972 default:
973 ESP_LOGD(TAG, "Unhandled event type: %" PRId32, msg.event_type);
974 break;
975 }
976}
977
979#ifdef USE_SPEAKER // We should never get to this function if there is no speaker anyway
980 if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
981 if (this->speaker_buffer_index_ + msg.data_len < SPEAKER_BUFFER_SIZE) {
982 memcpy(this->speaker_buffer_ + this->speaker_buffer_index_, msg.data, msg.data_len);
983 this->speaker_buffer_index_ += msg.data_len;
984 this->speaker_buffer_size_ += msg.data_len;
986 ESP_LOGV(TAG, "Received audio: %u bytes from API", msg.data_len);
987 } else {
988 ESP_LOGE(TAG, "Cannot receive audio, buffer is full");
989 }
990 }
991#endif
992}
993
995 // Find existing timer or add a new one
996 auto it = this->timers_.begin();
997 for (; it != this->timers_.end(); ++it) {
998 if (it->id == msg.timer_id)
999 break;
1000 }
1001 if (it == this->timers_.end()) {
1002 this->timers_.push_back({});
1003 it = this->timers_.end() - 1;
1004 }
1005 it->id = msg.timer_id;
1006 it->name = msg.name;
1007 it->total_seconds = msg.total_seconds;
1008 it->seconds_left = msg.seconds_left;
1009 it->is_active = msg.is_active;
1010
1011 char timer_buf[Timer::TO_STR_BUFFER_SIZE];
1012 ESP_LOGD(TAG,
1013 "Timer Event\n"
1014 " Type: %" PRId32 "\n"
1015 " %s",
1016 msg.event_type, it->to_str(timer_buf));
1017
1018 switch (msg.event_type) {
1020 this->timer_started_trigger_.trigger(*it);
1021 break;
1023 this->timer_updated_trigger_.trigger(*it);
1024 break;
1026 this->timer_cancelled_trigger_.trigger(*it);
1027 this->timers_.erase(it);
1028 break;
1030 this->timer_finished_trigger_.trigger(*it);
1031 this->timers_.erase(it);
1032 break;
1033 }
1034
1035 if (this->timers_.empty()) {
1036 this->cancel_interval("timer-event");
1037 this->timer_tick_running_ = false;
1038 } else if (!this->timer_tick_running_) {
1039 this->set_interval("timer-event", 1000, [this]() { this->timer_tick_(); });
1040 this->timer_tick_running_ = true;
1041 }
1042}
1043
1045 for (auto &timer : this->timers_) {
1046 if (timer.is_active && timer.seconds_left > 0) {
1047 timer.seconds_left--;
1048 }
1049 }
1050 this->timer_tick_trigger_.trigger(this->timers_);
1051}
1052
1054#ifdef USE_MEDIA_PLAYER
1055 if (this->media_player_ != nullptr) {
1056 this->tts_start_trigger_.trigger(msg.text);
1057
1059
1060 if (!msg.preannounce_media_id.empty()) {
1062 }
1063 // Enqueueing a URL with an empty playlist will still play the file immediately
1064 this->media_player_->make_call()
1067 .set_announcement(true)
1068 .perform();
1070
1072
1073 if (this->continuous_) {
1075 } else {
1077 }
1078
1080 this->end_trigger_.trigger();
1081 }
1082#endif
1083}
1084
1085void VoiceAssistant::on_set_configuration(const std::vector<std::string> &active_wake_words) {
1086#ifdef USE_MICRO_WAKE_WORD
1087 if (this->micro_wake_word_) {
1088 // Disable all wake words first
1089 for (auto &model : this->micro_wake_word_->get_wake_words()) {
1090 model->disable();
1091 }
1092
1093 // Enable only active wake words
1094 for (const auto &ww_id : active_wake_words) {
1095 for (auto &model : this->micro_wake_word_->get_wake_words()) {
1096 if (model->get_id() == ww_id) {
1097 model->enable();
1098 ESP_LOGD(TAG, "Enabled wake word: %s (id=%s)", model->get_wake_word().c_str(), model->get_id().c_str());
1099 }
1100 }
1101 }
1102 }
1103#endif
1104};
1105
1107 this->config_.available_wake_words.clear();
1108 this->config_.active_wake_words.clear();
1109
1110#ifdef USE_MICRO_WAKE_WORD
1111 if (this->micro_wake_word_) {
1113
1114 for (auto &model : this->micro_wake_word_->get_wake_words()) {
1115 if (model->is_enabled()) {
1116 this->config_.active_wake_words.push_back(model->get_id());
1117 }
1118
1119 WakeWord wake_word;
1120 wake_word.id = model->get_id();
1121 wake_word.wake_word = model->get_wake_word();
1122 for (const auto &lang : model->get_trained_languages()) {
1123 wake_word.trained_languages.push_back(lang);
1124 }
1125 this->config_.available_wake_words.push_back(std::move(wake_word));
1126 }
1127 } else {
1128#endif
1129 // No microWakeWord
1131#ifdef USE_MICRO_WAKE_WORD
1132 }
1133#endif
1134
1135 return this->config_;
1136};
1137
1138VoiceAssistant *global_voice_assistant = nullptr; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
1139
1140} // namespace esphome::voice_assistant
1141
1142#endif // USE_VOICE_ASSISTANT
uint32_t IRAM_ATTR HOT get_loop_component_start_time() const
Get the cached time in milliseconds from when the current component started its loop execution.
void mark_failed()
Mark this component as failed.
void defer(const char *name, std::function< void()> &&f) (a separate overload is marked ESPDEPRECATED("Use const char* overload instead. Removed in 2026.7.0", "2026.1.0"))
Defer a callback to the next loop() call.
Definition component.h:543
void set_timeout(const char *name, uint32_t timeout, std::function< void()> &&f) (a separate overload is marked ESPDEPRECATED("Use const char* or uint32_t overload instead. Removed in 2026.7.0", "2026.1.0"))
Set a timeout function with a unique name.
Definition component.h:493
void status_clear_error()
Definition component.h:295
void set_interval(const char *name, uint32_t interval, std::function< void()> &&f) (a separate overload is marked ESPDEPRECATED("Use const char* or uint32_t overload instead. Removed in 2026.7.0", "2026.1.0"))
Set an interval function with a unique name.
Definition component.h:400
bool status_has_error() const
Definition component.h:280
bool cancel_timeout(const char *name)
Cancel a timeout function.
Definition component.h:515
bool cancel_interval(const char *name)
Cancel an interval function.
Definition component.h:422
An STL allocator that uses SPI or internal RAM.
Definition helpers.h:2053
void deallocate(T *p, size_t n)
Definition helpers.h:2110
T * allocate(size_t n)
Definition helpers.h:2080
StringRef is a reference to a string owned by something else.
Definition string_ref.h:26
constexpr bool empty() const
Definition string_ref.h:76
void trigger(const Ts &...x) ESPHOME_ALWAYS_INLINE
Inform the parent automation that the event has triggered.
Definition automation.h:461
const char * get_peername_to(std::span< char, socket::SOCKADDR_STR_LEN > buf) const
Get peer name (IP address) into caller-provided buffer, returns buf for convenience.
const char * get_name() const
bool send_message(const T &msg)
enums::VoiceAssistantEvent event_type
Definition api_pb2.h:2426
std::vector< VoiceAssistantEventData > data
Definition api_pb2.h:2427
VoiceAssistantAudioSettings audio_settings
Definition api_pb2.h:2382
enums::VoiceAssistantTimerEvent event_type
Definition api_pb2.h:2465
static std::unique_ptr< RingBufferAudioSource > create(std::shared_ptr< ring_buffer::RingBuffer > ring_buffer, size_t max_fill_bytes, uint8_t alignment_bytes=1)
Creates a new ring-buffer-backed audio source after validating its parameters.
MediaPlayerCall & set_media_url(const std::string &url)
MediaPlayerCall & set_announcement(bool announce)
MediaPlayerCall & set_command(MediaPlayerCommand command)
void add_on_state_callback(F &&callback)
std::vector< WakeWordModel * > get_wake_words()
void add_data_callback(F &&data_callback)
static std::unique_ptr< RingBuffer > create(size_t len, MemoryPreference preference=MemoryPreference::EXTERNAL_FIRST)
virtual size_t play(const uint8_t *data, size_t length)=0
Plays the provided audio data.
bool is_running() const
Definition speaker.h:65
virtual bool has_buffered_data() const =0
virtual void start()=0
virtual void stop()=0
std::unique_ptr< socket::Socket > socket_
microphone::MicrophoneSource * mic_source2_
void on_timer_event(const api::VoiceAssistantTimerEventResponse &msg)
void on_audio(const api::VoiceAssistantAudio &msg)
std::unique_ptr< audio::RingBufferAudioSource > audio_source_
std::weak_ptr< ring_buffer::RingBuffer > ring_buffer2_
std::weak_ptr< ring_buffer::RingBuffer > ring_buffer_
media_player::MediaPlayer * media_player_
void client_subscription(api::APIConnection *client, bool subscribe)
MediaPlayerResponseState media_player_response_state_
void on_event(const api::VoiceAssistantEventResponse &msg)
Trigger< std::string, std::string > error_trigger_
Trigger< const std::vector< Timer > & > timer_tick_trigger_
Trigger< std::string > intent_progress_trigger_
void on_announce(const api::VoiceAssistantAnnounceRequest &msg)
void request_start(bool continuous, bool silence_detection)
void handle_channel_stall_(size_t available, size_t available2)
std::unique_ptr< audio::RingBufferAudioSource > audio_source2_
microphone::MicrophoneSource * mic_source_
micro_wake_word::MicroWakeWord * micro_wake_word_
void on_set_configuration(const std::vector< std::string > &active_wake_words)
const LogString * message
Definition component.cpp:35
uint16_t flags
bool state
Definition fan.h:2
uint32_t socklen_t
Definition headers.h:99
__int64 ssize_t
Definition httplib.h:178
@ VOICE_ASSISTANT_REQUEST_USE_WAKE_WORD
Definition api_pb2.h:254
@ VOICE_ASSISTANT_REQUEST_USE_VAD
Definition api_pb2.h:253
@ VOICE_ASSISTANT_TIMER_UPDATED
Definition api_pb2.h:277
@ VOICE_ASSISTANT_TIMER_STARTED
Definition api_pb2.h:276
@ VOICE_ASSISTANT_TIMER_FINISHED
Definition api_pb2.h:279
@ VOICE_ASSISTANT_TIMER_CANCELLED
Definition api_pb2.h:278
@ VOICE_ASSISTANT_WAKE_WORD_START
Definition api_pb2.h:267
@ VOICE_ASSISTANT_TTS_STREAM_END
Definition api_pb2.h:272
@ VOICE_ASSISTANT_STT_VAD_START
Definition api_pb2.h:269
@ VOICE_ASSISTANT_INTENT_PROGRESS
Definition api_pb2.h:273
@ VOICE_ASSISTANT_TTS_STREAM_START
Definition api_pb2.h:271
@ VOICE_ASSISTANT_WAKE_WORD_END
Definition api_pb2.h:268
constexpr float AFTER_CONNECTION
For components that should be initialized after a data connection (API/MQTT) is connected.
Definition component.h:55
std::unique_ptr< Socket > socket(int domain, int type, int protocol)
Create a socket of the given domain, type and protocol.
socklen_t set_sockaddr_any(struct sockaddr *addr, socklen_t addrlen, uint16_t port)
Set a sockaddr to the any address and specified port for the IP version used by socket_ip().
Definition socket.cpp:146
VoiceAssistant * global_voice_assistant
int written
Definition helpers.h:1045
Application App
Global storage of Application pointer - only one Application can exist.
static void uint32_t
std::vector< WakeWord > available_wake_words
std::vector< std::string > active_wake_words
static constexpr size_t TO_STR_BUFFER_SIZE
Buffer size for to_str() - sufficient for typical timer names.
std::vector< std::string > trained_languages
sa_family_t ss_family
Definition headers.h:94