ESPHome 2026.5.0-dev
Loading...
Searching...
No Matches
voice_assistant.cpp
Go to the documentation of this file.
1#include "voice_assistant.h"
3
4#ifdef USE_VOICE_ASSISTANT
5
7#include "esphome/core/log.h"
8
9#include <cinttypes>
10#include <cstdio>
11
12namespace esphome {
13namespace voice_assistant {
14
/// Tag used for every log message emitted by this component.
static const char *const TAG = "voice_assistant";

// Drop any SAMPLE_RATE_HZ macro that is already defined so the name can be used for the constant below.
#ifdef SAMPLE_RATE_HZ
#undef SAMPLE_RATE_HZ
#endif

/// Microphone sample rate in Hz. Samples are int16_t (see the *_SIZE constants below).
static constexpr size_t SAMPLE_RATE_HZ = 16000;

/// Amount of microphone audio, in milliseconds, held by the ring buffer.
static constexpr size_t RING_BUFFER_DURATION_MS = 512;
/// Amount of microphone audio, in milliseconds, sent per outgoing chunk.
static constexpr size_t SEND_BUFFER_DURATION_MS = 32;

/// Ring buffer capacity in samples: 512 ms * 16 kHz / 1000 ms.
static constexpr size_t RING_BUFFER_SAMPLES = RING_BUFFER_DURATION_MS * SAMPLE_RATE_HZ / 1000;
/// Ring buffer capacity in bytes.
static constexpr size_t RING_BUFFER_SIZE = RING_BUFFER_SAMPLES * sizeof(int16_t);
/// Send buffer capacity in samples: 32 ms * 16 kHz / 1000 ms.
static constexpr size_t SEND_BUFFER_SAMPLES = SEND_BUFFER_DURATION_MS * SAMPLE_RATE_HZ / 1000;
/// Send buffer capacity in bytes.
static constexpr size_t SEND_BUFFER_SIZE = SEND_BUFFER_SAMPLES * sizeof(int16_t);
/// Maximum number of bytes requested from the socket per read.
static constexpr size_t RECEIVE_SIZE = 1024;
/// Speaker buffer capacity in bytes (16 receive chunks).
static constexpr size_t SPEAKER_BUFFER_SIZE = 16 * RECEIVE_SIZE;
29
31
33 this->mic_source_->add_data_callback([this](const std::vector<uint8_t> &data) {
34 std::shared_ptr<RingBuffer> temp_ring_buffer = this->ring_buffer_;
35 if (this->ring_buffer_.use_count() > 1) {
36 temp_ring_buffer->write((void *) data.data(), data.size());
37 }
38 });
39
40#ifdef USE_MEDIA_PLAYER
41 if (this->media_player_ != nullptr) {
43 switch (state) {
46 // State changed to announcing after receiving the url
48 }
49 break;
50 default:
52 // No longer announcing the TTS response
54 }
55 break;
56 }
57 });
58 }
59#endif
60}
61
63
65 this->socket_ = socket::socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
66 if (this->socket_ == nullptr) {
67 ESP_LOGE(TAG, "Could not create socket");
68 this->mark_failed();
69 return false;
70 }
71 int enable = 1;
72 int err = this->socket_->setsockopt(SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int));
73 if (err != 0) {
74 ESP_LOGW(TAG, "Socket unable to set reuseaddr: errno %d", err);
75 // we can still continue
76 }
77 err = this->socket_->setblocking(false);
78 if (err != 0) {
79 ESP_LOGE(TAG, "Socket unable to set nonblocking mode: errno %d", err);
80 this->mark_failed();
81 return false;
82 }
83
84#ifdef USE_SPEAKER
85 if (this->speaker_ != nullptr) {
86 struct sockaddr_storage server;
87
88 socklen_t sl = socket::set_sockaddr_any((struct sockaddr *) &server, sizeof(server), 6055);
89 if (sl == 0) {
90 ESP_LOGE(TAG, "Socket unable to set sockaddr: errno %d", errno);
91 this->mark_failed();
92 return false;
93 }
94
95 err = this->socket_->bind((struct sockaddr *) &server, sizeof(server));
96 if (err != 0) {
97 ESP_LOGE(TAG, "Socket unable to bind: errno %d", errno);
98 this->mark_failed();
99 return false;
100 }
101 }
102#endif
103 this->udp_socket_running_ = true;
104 return true;
105}
106
108#ifdef USE_SPEAKER
109 if ((this->speaker_ != nullptr) && (this->speaker_buffer_ == nullptr)) {
110 RAMAllocator<uint8_t> speaker_allocator;
111 this->speaker_buffer_ = speaker_allocator.allocate(SPEAKER_BUFFER_SIZE);
112 if (this->speaker_buffer_ == nullptr) {
113 ESP_LOGW(TAG, "Could not allocate speaker buffer");
114 return false;
115 }
116 }
117#endif
118
119 if (this->ring_buffer_.use_count() == 0) {
120 this->ring_buffer_ = RingBuffer::create(RING_BUFFER_SIZE);
121 if (this->ring_buffer_.use_count() == 0) {
122 ESP_LOGE(TAG, "Could not allocate ring buffer");
123 return false;
124 }
125 }
126
127 if (this->send_buffer_ == nullptr) {
128 RAMAllocator<uint8_t> send_allocator;
129 this->send_buffer_ = send_allocator.allocate(SEND_BUFFER_SIZE);
130 if (send_buffer_ == nullptr) {
131 ESP_LOGW(TAG, "Could not allocate send buffer");
132 return false;
133 }
134 }
135
136 return true;
137}
138
140 if (this->send_buffer_ != nullptr) {
141 memset(this->send_buffer_, 0, SEND_BUFFER_SIZE);
142 }
143
144 if (this->ring_buffer_ != nullptr) {
145 this->ring_buffer_->reset();
146 }
147
148#ifdef USE_SPEAKER
149 if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
150 memset(this->speaker_buffer_, 0, SPEAKER_BUFFER_SIZE);
151
152 this->speaker_buffer_size_ = 0;
153 this->speaker_buffer_index_ = 0;
154 this->speaker_bytes_received_ = 0;
155 }
156#endif
157}
158
160 if (this->send_buffer_ != nullptr) {
161 RAMAllocator<uint8_t> send_deallocator;
162 send_deallocator.deallocate(this->send_buffer_, SEND_BUFFER_SIZE);
163 this->send_buffer_ = nullptr;
164 }
165
166 if (this->ring_buffer_.use_count() > 0) {
167 this->ring_buffer_.reset();
168 }
169
170#ifdef USE_SPEAKER
171 if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
172 RAMAllocator<uint8_t> speaker_deallocator;
173 speaker_deallocator.deallocate(this->speaker_buffer_, SPEAKER_BUFFER_SIZE);
174 this->speaker_buffer_ = nullptr;
175 }
176#endif
177}
178
180 this->conversation_id_ = "";
181 ESP_LOGD(TAG, "reset conversation ID");
182}
183
185 if (this->api_client_ == nullptr && this->state_ != State::IDLE && this->state_ != State::STOP_MICROPHONE &&
187 if (this->mic_source_->is_running() || this->state_ == State::STARTING_MICROPHONE) {
189 } else {
191 }
192 this->continuous_ = false;
193 this->signal_stop_();
194 this->clear_buffers_();
195 return;
196 }
197 switch (this->state_) {
198 case State::IDLE: {
199 if (this->continuous_ && this->desired_state_ == State::IDLE) {
200 this->idle_trigger_.trigger();
202 } else {
203 this->deallocate_buffers_();
204 }
205 break;
206 }
208 ESP_LOGD(TAG, "Starting Microphone");
209 if (!this->allocate_buffers_()) {
210 this->status_set_error(LOG_STR("Failed to allocate buffers"));
211 return;
212 }
213 if (this->status_has_error()) {
214 this->status_clear_error();
215 }
216 this->clear_buffers_();
217
218 this->mic_source_->start();
220 break;
221 }
223 if (this->mic_source_->is_running()) {
224 this->set_state_(this->desired_state_);
225 }
226 break;
227 }
229 ESP_LOGD(TAG, "Requesting start");
230 uint32_t flags = 0;
231 if (!this->continue_conversation_ && this->use_wake_word_)
233 if (this->silence_detection_)
237 audio_settings.auto_gain = this->auto_gain_;
238 audio_settings.volume_multiplier = this->volume_multiplier_;
239
241 msg.start = true;
243 msg.flags = flags;
244 msg.audio_settings = audio_settings;
246
247 // Reset media player state tracking
248#ifdef USE_MEDIA_PLAYER
249 if (this->media_player_ != nullptr) {
251 }
252#endif
253
254 if (this->api_client_ == nullptr || !this->api_client_->send_message(msg)) {
255 ESP_LOGW(TAG, "Could not request start");
256 this->error_trigger_.trigger("not-connected", "Could not request start");
257 this->continuous_ = false;
259 break;
260 }
262 this->set_timeout("reset-conversation_id", this->conversation_timeout_,
263 [this]() { this->reset_conversation_id(); });
264 break;
265 }
267 break; // State changed when udp server port received
268 }
270 size_t available = this->ring_buffer_->available();
271 while (available >= SEND_BUFFER_SIZE) {
272 size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0);
273 if (this->audio_mode_ == AUDIO_MODE_API) {
275 msg.data = this->send_buffer_;
276 msg.data_len = read_bytes;
277 this->api_client_->send_message(msg);
278 } else {
279 if (!this->udp_socket_running_) {
280 if (!this->start_udp_socket_()) {
282 break;
283 }
284 }
285 this->socket_->sendto(this->send_buffer_, read_bytes, 0, (struct sockaddr *) &this->dest_addr_,
286 sizeof(this->dest_addr_));
287 }
288 available = this->ring_buffer_->available();
289 }
290
291 break;
292 }
294 if (this->mic_source_->is_running()) {
295 this->mic_source_->stop();
297 } else {
298 this->set_state_(this->desired_state_);
299 }
300 break;
301 }
303 if (this->mic_source_->is_stopped()) {
304 this->set_state_(this->desired_state_);
305 }
306 break;
307 }
309 break; // State changed by events
310 }
312 bool playing = false;
313#ifdef USE_SPEAKER
314 if (this->speaker_ != nullptr) {
315 ssize_t received_len = 0;
316 if (this->audio_mode_ == AUDIO_MODE_UDP) {
317 if (this->speaker_buffer_index_ + RECEIVE_SIZE < SPEAKER_BUFFER_SIZE) {
318 received_len = this->socket_->read(this->speaker_buffer_ + this->speaker_buffer_index_, RECEIVE_SIZE);
319 if (received_len > 0) {
320 this->speaker_buffer_index_ += received_len;
321 this->speaker_buffer_size_ += received_len;
322 this->speaker_bytes_received_ += received_len;
323 }
324 } else {
325 ESP_LOGD(TAG, "Receive buffer full");
326 }
327 }
328 // Build a small buffer of audio before sending to the speaker
329 bool end_of_stream = this->stream_ended_ && (this->audio_mode_ == AUDIO_MODE_API || received_len < 0);
330 if (this->speaker_bytes_received_ > RECEIVE_SIZE * 4 || end_of_stream)
331 this->write_speaker_();
332 if (this->wait_for_stream_end_) {
333 this->cancel_timeout("playing");
334 if (end_of_stream) {
335 ESP_LOGD(TAG, "End of audio stream received");
336 this->cancel_timeout("speaker-timeout");
338 }
339 break; // We dont want to timeout here as the STREAM_END event will take care of that.
340 }
341 playing = this->speaker_->is_running();
342 }
343#endif
344#ifdef USE_MEDIA_PLAYER
345 if (this->media_player_ != nullptr) {
347
350 this->cancel_timeout("playing");
351 ESP_LOGD(TAG, "Announcement finished playing");
353
355 msg.success = true;
356 this->api_client_->send_message(msg);
357 break;
358 }
359 }
360#endif
361 if (playing) {
363 }
364 break;
365 }
367#ifdef USE_SPEAKER
368 if (this->speaker_ != nullptr) {
369 if (this->speaker_buffer_size_ > 0) {
370 this->write_speaker_();
371 break;
372 }
373 if (this->speaker_->has_buffered_data() || this->speaker_->is_running()) {
374 break;
375 }
376 ESP_LOGD(TAG, "Speaker has finished outputting all audio");
377 this->speaker_->stop();
378 this->cancel_timeout("speaker-timeout");
379 this->cancel_timeout("playing");
380
381 this->clear_buffers_();
382
383 this->wait_for_stream_end_ = false;
384 this->stream_ended_ = false;
385
387 }
388#endif
389 if (this->continue_conversation_) {
391 } else {
393 }
394 break;
395 }
396 default:
397 break;
398 }
399}
400
401#ifdef USE_SPEAKER
403 if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
404 if (this->speaker_buffer_size_ > 0) {
405 size_t write_chunk = std::min<size_t>(this->speaker_buffer_size_, 4 * 1024);
406 size_t written = this->speaker_->play(this->speaker_buffer_, write_chunk);
407 if (written > 0) {
408 memmove(this->speaker_buffer_, this->speaker_buffer_ + written, this->speaker_buffer_size_ - written);
411 this->set_timeout("speaker-timeout", 5000, [this]() { this->speaker_->stop(); });
412 } else {
413 ESP_LOGV(TAG, "Speaker buffer full, trying again next loop");
414 }
415 }
416 }
417}
418#endif
419
421 if (!subscribe) {
422 if (this->api_client_ == nullptr || client != this->api_client_) {
423 ESP_LOGE(TAG, "Client attempting to unsubscribe that is not the current API Client");
424 return;
425 }
426 this->api_client_ = nullptr;
428 return;
429 }
430
431 if (this->api_client_ != nullptr) {
432 char current_peername[socket::SOCKADDR_STR_LEN];
433 char new_peername[socket::SOCKADDR_STR_LEN];
434 ESP_LOGE(TAG,
435 "Multiple API Clients attempting to connect to Voice Assistant\n"
436 " Current client: %s (%s)\n"
437 " New client: %s (%s)",
438 this->api_client_->get_name(), this->api_client_->get_peername_to(current_peername), client->get_name(),
439 client->get_peername_to(new_peername));
440 return;
441 }
442
443 this->api_client_ = client;
445}
446
447static const LogString *voice_assistant_state_to_string(State state) {
448 switch (state) {
449 case State::IDLE:
450 return LOG_STR("IDLE");
452 return LOG_STR("START_MICROPHONE");
454 return LOG_STR("STARTING_MICROPHONE");
456 return LOG_STR("WAIT_FOR_VAD");
458 return LOG_STR("WAITING_FOR_VAD");
460 return LOG_STR("START_PIPELINE");
462 return LOG_STR("STARTING_PIPELINE");
464 return LOG_STR("STREAMING_MICROPHONE");
466 return LOG_STR("STOP_MICROPHONE");
468 return LOG_STR("STOPPING_MICROPHONE");
470 return LOG_STR("AWAITING_RESPONSE");
472 return LOG_STR("STREAMING_RESPONSE");
474 return LOG_STR("RESPONSE_FINISHED");
475 default:
476 return LOG_STR("UNKNOWN");
477 }
478};
479
481 State old_state = this->state_;
482 this->state_ = state;
483 ESP_LOGD(TAG, "State changed from %s to %s", LOG_STR_ARG(voice_assistant_state_to_string(old_state)),
484 LOG_STR_ARG(voice_assistant_state_to_string(state)));
485}
486
488 this->set_state_(state);
489 this->desired_state_ = desired_state;
490 ESP_LOGD(TAG, "Desired state set to %s", LOG_STR_ARG(voice_assistant_state_to_string(desired_state)));
491}
492
494 ESP_LOGE(TAG, "Failed to start server. See Home Assistant logs for more details.");
495 this->error_trigger_.trigger("failed-to-start", "Failed to start server. See Home Assistant logs for more details.");
497}
498
500 if (this->state_ != State::STARTING_PIPELINE) {
501 this->signal_stop_();
502 return;
503 }
504
505 ESP_LOGD(TAG, "Client started, streaming microphone");
507
508 if (this->mic_source_->is_running()) {
510 } else {
512 }
513}
514
/// Begin streaming microphone audio to a destination socket address.
///
/// If the component is not in State::STARTING_PIPELINE, the stop signal is sent and the call
/// returns without touching the destination address. Otherwise the address is copied into
/// dest_addr_ and its port field is set according to the address family.
///
/// @param addr Destination socket address; sizeof(dest_addr_) bytes are copied from it.
/// @param port Destination port in host byte order (converted with htons()).
515void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t port) {
516 if (this->state_ != State::STARTING_PIPELINE) {
517 this->signal_stop_();
518 return;
519 }
520
521 ESP_LOGD(TAG, "Client started, streaming microphone");
523
// Take a copy of the caller's address, then write the port into the family-specific field.
524 memcpy(&this->dest_addr_, addr, sizeof(this->dest_addr_));
525 if (this->dest_addr_.ss_family == AF_INET) {
526 ((struct sockaddr_in *) &this->dest_addr_)->sin_port = htons(port);
527 }
528#if LWIP_IPV6
529 else if (this->dest_addr_.ss_family == AF_INET6) {
530 ((struct sockaddr_in6 *) &this->dest_addr_)->sin6_port = htons(port);
531 }
532#endif
533 else {
// Unsupported family: warn and return without changing state.
534 ESP_LOGW(TAG, "Unknown address family: %d", this->dest_addr_.ss_family);
535 return;
536 }
537
// NOTE(review): both branch bodies below are empty in this listing (listing lines 539 and 541 are
// absent); presumably each selects the next state depending on whether the microphone is already
// running - confirm against the upstream file.
538 if (this->mic_source_->is_running()) {
540 } else {
542 }
543}
544
/// Request the start of a voice assistant session.
///
/// Without a connected API client an error is logged, continuous_ is cleared and the call
/// returns. Otherwise the two flags are stored, but only while state_ is State::IDLE; in any
/// other state the request is ignored.
///
/// @param continuous Value stored in continuous_ when the request is accepted.
/// @param silence_detection Value stored in silence_detection_ when the request is accepted.
545void VoiceAssistant::request_start(bool continuous, bool silence_detection) {
546 if (this->api_client_ == nullptr) {
547 ESP_LOGE(TAG, "No API client connected");
// NOTE(review): listing line 548 is absent here - confirm against the upstream file.
549 this->continuous_ = false;
550 return;
551 }
552 if (this->state_ == State::IDLE) {
553 this->continuous_ = continuous;
554 this->silence_detection_ = silence_detection;
555
// NOTE(review): listing line 556 is absent here; presumably it moves the state machine out of
// IDLE - confirm against the upstream file.
557 }
558}
559
561 this->continuous_ = false;
562 this->continue_conversation_ = false;
563
564 switch (this->state_) {
565 case State::IDLE:
566 break;
573 break;
576 this->signal_stop_();
578 break;
582 break;
584 this->signal_stop_();
585 break;
587#ifdef USE_MEDIA_PLAYER
588 // Stop any ongoing media player announcement
589 if (this->media_player_ != nullptr) {
590 this->media_player_->make_call()
592 .set_announcement(true)
593 .perform();
594 }
595 if (this->started_streaming_tts_) {
596 // Haven't reached the TTS_END stage, so send the stop signal to HA.
597 this->signal_stop_();
598 }
599#endif
600 break;
602 break; // Let the incoming audio stream finish then it will go to idle.
603 }
604}
605
607 memset(&this->dest_addr_, 0, sizeof(this->dest_addr_));
608 if (this->api_client_ == nullptr) {
609 return;
610 }
611 ESP_LOGD(TAG, "Signaling stop");
613 msg.start = false;
614 this->api_client_->send_message(msg);
615}
616
618 this->set_timeout("playing", 2000, [this]() {
619 this->cancel_timeout("speaker-timeout");
621
622 if (this->api_client_ == nullptr)
623 return;
625 msg.success = true;
626 this->api_client_->send_message(msg);
627 });
628}
629
631 ESP_LOGD(TAG, "Event Type: %" PRId32, msg.event_type);
632 switch (msg.event_type) {
634 ESP_LOGD(TAG, "Assist Pipeline running");
635#ifdef USE_MEDIA_PLAYER
636 this->started_streaming_tts_ = false;
637 for (const auto &arg : msg.data) {
638 if (arg.name == "url") {
639 this->tts_response_url_ = arg.value;
640 }
641 }
642#endif
643 this->defer([this]() { this->start_trigger_.trigger(); });
644 break;
646 break;
648 ESP_LOGD(TAG, "Wake word detected");
649 this->defer([this]() { this->wake_word_detected_trigger_.trigger(); });
650 break;
651 }
653 ESP_LOGD(TAG, "STT started");
654 this->defer([this]() { this->listening_trigger_.trigger(); });
655 break;
657 std::string text;
658 for (const auto &arg : msg.data) {
659 if (arg.name == "text") {
660 text = arg.value;
661 }
662 }
663 if (text.empty()) {
664 ESP_LOGW(TAG, "No text in STT_END event");
665 return;
666 } else if (text.length() > 500) {
667 text.resize(497);
668 text += "...";
669 }
670 ESP_LOGD(TAG, "Speech recognised as: \"%s\"", text.c_str());
671 this->defer([this, text]() { this->stt_end_trigger_.trigger(text); });
672 break;
673 }
675 ESP_LOGD(TAG, "Intent started");
676 this->defer([this]() { this->intent_start_trigger_.trigger(); });
677 break;
679 ESP_LOGD(TAG, "Intent progress");
680 std::string tts_url_for_trigger = "";
681#ifdef USE_MEDIA_PLAYER
682 if (this->media_player_ != nullptr) {
683 for (const auto &arg : msg.data) {
684 if ((arg.name == "tts_start_streaming") && (arg.value == "1") && !this->tts_response_url_.empty()) {
686
688
689 this->started_streaming_tts_ = true;
691
692 tts_url_for_trigger = this->tts_response_url_;
693 this->tts_response_url_.clear(); // Reset streaming URL
695 }
696 }
697 }
698#endif
699 this->defer([this, tts_url_for_trigger]() { this->intent_progress_trigger_.trigger(tts_url_for_trigger); });
700 break;
701 }
703 for (const auto &arg : msg.data) {
704 if (arg.name == "conversation_id") {
705 this->conversation_id_ = arg.value;
706 } else if (arg.name == "continue_conversation") {
707 this->continue_conversation_ = (arg.value == "1");
708 }
709 }
710 this->defer([this]() { this->intent_end_trigger_.trigger(); });
711 break;
712 }
714 std::string text;
715 for (const auto &arg : msg.data) {
716 if (arg.name == "text") {
717 text = arg.value;
718 }
719 }
720 if (text.empty()) {
721 ESP_LOGW(TAG, "No text in TTS_START event");
722 return;
723 }
724 if (text.length() > 500) {
725 text.resize(497);
726 text += "...";
727 }
728 ESP_LOGD(TAG, "Response: \"%s\"", text.c_str());
729 this->defer([this, text]() {
730 this->tts_start_trigger_.trigger(text);
731#ifdef USE_SPEAKER
732 if (this->speaker_ != nullptr) {
733 this->speaker_->start();
734 }
735#endif
736 });
737 break;
738 }
740 std::string url;
741 for (const auto &arg : msg.data) {
742 if (arg.name == "url") {
743 url = arg.value;
744 }
745 }
746 if (url.empty()) {
747 ESP_LOGW(TAG, "No url in TTS_END event");
748 return;
749 }
750 ESP_LOGD(TAG, "Response URL: \"%s\"", url.c_str());
751 this->defer([this, url]() {
752#ifdef USE_MEDIA_PLAYER
753 if ((this->media_player_ != nullptr) && (!this->started_streaming_tts_)) {
755
757
759 }
760 this->started_streaming_tts_ = false; // Helps indicate reaching the TTS_END stage
761#endif
762 this->tts_end_trigger_.trigger(url);
763 });
765 if (new_state != this->state_) {
766 // Don't needlessly change the state. The intent progress stage may have already changed the state to streaming
767 // response.
768 this->set_state_(new_state, new_state);
769 }
770 break;
771 }
773 ESP_LOGD(TAG, "Assist Pipeline ended");
774 if ((this->state_ == State::START_PIPELINE) || (this->state_ == State::STARTING_PIPELINE) ||
776 // Microphone is running, stop it
778 } else if (this->state_ == State::AWAITING_RESPONSE) {
779 // No TTS start event ("nevermind")
781 }
782 this->defer([this]() { this->end_trigger_.trigger(); });
783 break;
784 }
786 std::string code = "";
787 std::string message = "";
788 for (const auto &arg : msg.data) {
789 if (arg.name == "code") {
790 code = arg.value;
791 } else if (arg.name == "message") {
792 message = arg.value;
793 }
794 }
795 if (code == "wake-word-timeout" || code == "wake_word_detection_aborted" || code == "no_wake_word") {
796 // Don't change state here since either the "tts-end" or "run-end" events will do it.
797 return;
798 } else if (code == "wake-provider-missing" || code == "wake-engine-missing") {
799 // Wake word is not set up or not ready on Home Assistant so stop and do not retry until user starts again.
800 this->defer([this, code, message]() {
801 this->request_stop();
802 this->error_trigger_.trigger(code, message);
803 });
804 return;
805 }
806 ESP_LOGE(TAG, "Error: %s - %s", code.c_str(), message.c_str());
807 if (this->state_ != State::IDLE) {
808 this->signal_stop_();
810 }
811 this->defer([this, code, message]() { this->error_trigger_.trigger(code, message); });
812 break;
813 }
815#ifdef USE_SPEAKER
816 if (this->speaker_ != nullptr) {
817 this->wait_for_stream_end_ = true;
818 ESP_LOGD(TAG, "TTS stream start");
819 this->defer([this] { this->tts_stream_start_trigger_.trigger(); });
820 }
821#endif
822 break;
823 }
825#ifdef USE_SPEAKER
826 if (this->speaker_ != nullptr) {
827 this->stream_ended_ = true;
828 ESP_LOGD(TAG, "TTS stream end");
829 }
830#endif
831 break;
832 }
834 ESP_LOGD(TAG, "Starting STT by VAD");
835 this->defer([this]() { this->stt_vad_start_trigger_.trigger(); });
836 break;
838 ESP_LOGD(TAG, "STT by VAD end");
840 this->defer([this]() { this->stt_vad_end_trigger_.trigger(); });
841 break;
842 default:
843 ESP_LOGD(TAG, "Unhandled event type: %" PRId32, msg.event_type);
844 break;
845 }
846}
847
849#ifdef USE_SPEAKER // We should never get to this function if there is no speaker anyway
850 if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
851 if (this->speaker_buffer_index_ + msg.data_len < SPEAKER_BUFFER_SIZE) {
852 memcpy(this->speaker_buffer_ + this->speaker_buffer_index_, msg.data, msg.data_len);
853 this->speaker_buffer_index_ += msg.data_len;
854 this->speaker_buffer_size_ += msg.data_len;
856 ESP_LOGV(TAG, "Received audio: %u bytes from API", msg.data_len);
857 } else {
858 ESP_LOGE(TAG, "Cannot receive audio, buffer is full");
859 }
860 }
861#endif
862}
863
865 // Find existing timer or add a new one
866 auto it = this->timers_.begin();
867 for (; it != this->timers_.end(); ++it) {
868 if (it->id == msg.timer_id)
869 break;
870 }
871 if (it == this->timers_.end()) {
872 this->timers_.push_back({});
873 it = this->timers_.end() - 1;
874 }
875 it->id = msg.timer_id;
876 it->name = msg.name;
877 it->total_seconds = msg.total_seconds;
878 it->seconds_left = msg.seconds_left;
879 it->is_active = msg.is_active;
880
881 char timer_buf[Timer::TO_STR_BUFFER_SIZE];
882 ESP_LOGD(TAG,
883 "Timer Event\n"
884 " Type: %" PRId32 "\n"
885 " %s",
886 msg.event_type, it->to_str(timer_buf));
887
888 switch (msg.event_type) {
890 this->timer_started_trigger_.trigger(*it);
891 break;
893 this->timer_updated_trigger_.trigger(*it);
894 break;
896 this->timer_cancelled_trigger_.trigger(*it);
897 this->timers_.erase(it);
898 break;
900 this->timer_finished_trigger_.trigger(*it);
901 this->timers_.erase(it);
902 break;
903 }
904
905 if (this->timers_.empty()) {
906 this->cancel_interval("timer-event");
907 this->timer_tick_running_ = false;
908 } else if (!this->timer_tick_running_) {
909 this->set_interval("timer-event", 1000, [this]() { this->timer_tick_(); });
910 this->timer_tick_running_ = true;
911 }
912}
913
915 for (auto &timer : this->timers_) {
916 if (timer.is_active && timer.seconds_left > 0) {
917 timer.seconds_left--;
918 }
919 }
920 this->timer_tick_trigger_.trigger(this->timers_);
921}
922
924#ifdef USE_MEDIA_PLAYER
925 if (this->media_player_ != nullptr) {
927
929
930 if (!msg.preannounce_media_id.empty()) {
932 }
933 // Enqueueing a URL with an empty playlist will still play the file immediately
934 this->media_player_->make_call()
937 .set_announcement(true)
938 .perform();
940
942
943 if (this->continuous_) {
945 } else {
947 }
948
950 this->end_trigger_.trigger();
951 }
952#endif
953}
954
955void VoiceAssistant::on_set_configuration(const std::vector<std::string> &active_wake_words) {
956#ifdef USE_MICRO_WAKE_WORD
957 if (this->micro_wake_word_) {
958 // Disable all wake words first
959 for (auto &model : this->micro_wake_word_->get_wake_words()) {
960 model->disable();
961 }
962
963 // Enable only active wake words
964 for (const auto &ww_id : active_wake_words) {
965 for (auto &model : this->micro_wake_word_->get_wake_words()) {
966 if (model->get_id() == ww_id) {
967 model->enable();
968 ESP_LOGD(TAG, "Enabled wake word: %s (id=%s)", model->get_wake_word().c_str(), model->get_id().c_str());
969 }
970 }
971 }
972 }
973#endif
974};
975
977 this->config_.available_wake_words.clear();
978 this->config_.active_wake_words.clear();
979
980#ifdef USE_MICRO_WAKE_WORD
981 if (this->micro_wake_word_) {
983
984 for (auto &model : this->micro_wake_word_->get_wake_words()) {
985 if (model->is_enabled()) {
986 this->config_.active_wake_words.push_back(model->get_id());
987 }
988
989 WakeWord wake_word;
990 wake_word.id = model->get_id();
991 wake_word.wake_word = model->get_wake_word();
992 for (const auto &lang : model->get_trained_languages()) {
993 wake_word.trained_languages.push_back(lang);
994 }
995 this->config_.available_wake_words.push_back(std::move(wake_word));
996 }
997 } else {
998#endif
999 // No microWakeWord
1001#ifdef USE_MICRO_WAKE_WORD
1002 }
1003#endif
1004
1005 return this->config_;
1006};
1007
1008VoiceAssistant *global_voice_assistant = nullptr; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
1009
1010} // namespace voice_assistant
1011} // namespace esphome
1012
1013#endif // USE_VOICE_ASSISTANT
void mark_failed()
Mark this component as failed.
void defer(const char *name, std::function< void()> &&f) [the overload taking a std::string name is marked ESPDEPRECATED("Use const char* overload instead. Removed in 2026.7.0", "2026.1.0")]
Defer a callback to the next loop() call.
Definition component.h:560
void set_timeout(const char *name, uint32_t timeout, std::function< void()> &&f) [the overload taking a std::string name is marked ESPDEPRECATED("Use const char* or uint32_t overload instead. Removed in 2026.7.0", "2026.1.0")]
Set a timeout function with a unique name.
Definition component.h:510
void status_clear_error()
Definition component.h:312
void set_interval(const char *name, uint32_t interval, std::function< void()> &&f) [the overload taking a std::string name is marked ESPDEPRECATED("Use const char* or uint32_t overload instead. Removed in 2026.7.0", "2026.1.0")]
Set an interval function with a unique name.
Definition component.h:417
bool status_has_error() const
Definition component.h:292
bool cancel_timeout(const char *name) [the overload taking a std::string name is marked ESPDEPRECATED("Use const char* or uint32_t overload instead. Removed in 2026.7.0", "2026.1.0")]
Cancel a timeout function.
Definition component.h:532
bool cancel_interval(const char *name) [the overload taking a std::string name is marked ESPDEPRECATED("Use const char* or uint32_t overload instead. Removed in 2026.7.0", "2026.1.0")]
Cancel an interval function.
Definition component.h:439
An STL allocator that uses SPI or internal RAM.
Definition helpers.h:2212
void deallocate(T *p, size_t n)
Definition helpers.h:2267
T * allocate(size_t n)
Definition helpers.h:2229
static std::unique_ptr< RingBuffer > create(size_t len)
StringRef is a reference to a string owned by something else.
Definition string_ref.h:26
constexpr bool empty() const
Definition string_ref.h:76
void trigger(const Ts &...x) ESPHOME_ALWAYS_INLINE
Inform the parent automation that the event has triggered.
Definition automation.h:482
const char * get_peername_to(std::span< char, socket::SOCKADDR_STR_LEN > buf) const
Get peer name (IP address) into caller-provided buffer, returns buf for convenience.
const char * get_name() const
bool send_message(const T &msg)
enums::VoiceAssistantEvent event_type
Definition api_pb2.h:2417
std::vector< VoiceAssistantEventData > data
Definition api_pb2.h:2418
VoiceAssistantAudioSettings audio_settings
Definition api_pb2.h:2373
enums::VoiceAssistantTimerEvent event_type
Definition api_pb2.h:2454
MediaPlayerCall & set_media_url(const std::string &url)
MediaPlayerCall & set_announcement(bool announce)
MediaPlayerCall & set_command(MediaPlayerCommand command)
void add_on_state_callback(F &&callback)
std::vector< WakeWordModel * > get_wake_words()
void add_data_callback(F &&data_callback)
virtual size_t play(const uint8_t *data, size_t length)=0
Plays the provided audio data.
bool is_running() const
Definition speaker.h:66
virtual bool has_buffered_data() const =0
virtual void start()=0
virtual void stop()=0
std::unique_ptr< socket::Socket > socket_
void on_timer_event(const api::VoiceAssistantTimerEventResponse &msg)
void on_audio(const api::VoiceAssistantAudio &msg)
media_player::MediaPlayer * media_player_
void client_subscription(api::APIConnection *client, bool subscribe)
MediaPlayerResponseState media_player_response_state_
std::shared_ptr< RingBuffer > ring_buffer_
void on_event(const api::VoiceAssistantEventResponse &msg)
Trigger< std::string, std::string > error_trigger_
Trigger< const std::vector< Timer > & > timer_tick_trigger_
Trigger< std::string > intent_progress_trigger_
void on_announce(const api::VoiceAssistantAnnounceRequest &msg)
void request_start(bool continuous, bool silence_detection)
microphone::MicrophoneSource * mic_source_
micro_wake_word::MicroWakeWord * micro_wake_word_
void on_set_configuration(const std::vector< std::string > &active_wake_words)
const char * message
Definition component.cpp:35
uint16_t flags
bool state
Definition fan.h:2
uint32_t socklen_t
Definition headers.h:99
__int64 ssize_t
Definition httplib.h:178
@ VOICE_ASSISTANT_REQUEST_USE_WAKE_WORD
Definition api_pb2.h:247
@ VOICE_ASSISTANT_REQUEST_USE_VAD
Definition api_pb2.h:246
@ VOICE_ASSISTANT_TIMER_UPDATED
Definition api_pb2.h:270
@ VOICE_ASSISTANT_TIMER_STARTED
Definition api_pb2.h:269
@ VOICE_ASSISTANT_TIMER_FINISHED
Definition api_pb2.h:272
@ VOICE_ASSISTANT_TIMER_CANCELLED
Definition api_pb2.h:271
@ VOICE_ASSISTANT_WAKE_WORD_START
Definition api_pb2.h:260
@ VOICE_ASSISTANT_TTS_STREAM_END
Definition api_pb2.h:265
@ VOICE_ASSISTANT_STT_VAD_START
Definition api_pb2.h:262
@ VOICE_ASSISTANT_INTENT_PROGRESS
Definition api_pb2.h:266
@ VOICE_ASSISTANT_TTS_STREAM_START
Definition api_pb2.h:264
@ VOICE_ASSISTANT_WAKE_WORD_END
Definition api_pb2.h:261
constexpr float AFTER_CONNECTION
For components that should be initialized after a data connection (API/MQTT) is connected.
Definition component.h:54
std::unique_ptr< Socket > socket(int domain, int type, int protocol)
Create a socket of the given domain, type and protocol.
socklen_t set_sockaddr_any(struct sockaddr *addr, socklen_t addrlen, uint16_t port)
Set a sockaddr to the any address and specified port for the IP version used by socket_ip().
Definition socket.cpp:143
VoiceAssistant * global_voice_assistant
Providing packet encoding functions for exchanging data with a remote host.
Definition a01nyub.cpp:7
int written
Definition helpers.h:1089
static void uint32_t
std::vector< WakeWord > available_wake_words
std::vector< std::string > active_wake_words
static constexpr size_t TO_STR_BUFFER_SIZE
Buffer size for to_str() - sufficient for typical timer names.
std::vector< std::string > trained_languages
sa_family_t ss_family
Definition headers.h:94