ESPHome 2025.12.0-dev
Loading...
Searching...
No Matches
voice_assistant.cpp
Go to the documentation of this file.
1#include "voice_assistant.h"
3
4#ifdef USE_VOICE_ASSISTANT
5
6#include "esphome/core/log.h"
7
8#include <cinttypes>
9#include <cstdio>
10
11namespace esphome {
12namespace voice_assistant {
13
14static const char *const TAG = "voice_assistant";
15
16#ifdef SAMPLE_RATE_HZ
17#undef SAMPLE_RATE_HZ
18#endif
19
// Sample rate (Hz) that all buffer sizes below are derived from.
20static const size_t SAMPLE_RATE_HZ = 16000;
21
// Ring buffer capacity: 512 ms worth of samples, then converted to bytes using 16-bit (int16_t) samples.
22static const size_t RING_BUFFER_SAMPLES = 512 * SAMPLE_RATE_HZ / 1000; // 512 ms * 16 kHz/ 1000 ms
23static const size_t RING_BUFFER_SIZE = RING_BUFFER_SAMPLES * sizeof(int16_t);
// Send buffer capacity: 32 ms worth of samples; this is also the chunk size read from the ring buffer per send.
24static const size_t SEND_BUFFER_SAMPLES = 32 * SAMPLE_RATE_HZ / 1000; // 32ms * 16kHz / 1000ms
25static const size_t SEND_BUFFER_SIZE = SEND_BUFFER_SAMPLES * sizeof(int16_t);
// Maximum number of bytes requested from the UDP socket in a single read of response audio.
26static const size_t RECEIVE_SIZE = 1024;
// Size in bytes of the buffer that stages response audio before it is handed to the speaker.
27static const size_t SPEAKER_BUFFER_SIZE = 16 * RECEIVE_SIZE;
28
30
32 this->mic_source_->add_data_callback([this](const std::vector<uint8_t> &data) {
33 std::shared_ptr<RingBuffer> temp_ring_buffer = this->ring_buffer_;
34 if (this->ring_buffer_.use_count() > 1) {
35 temp_ring_buffer->write((void *) data.data(), data.size());
36 }
37 });
38
39#ifdef USE_MEDIA_PLAYER
40 if (this->media_player_ != nullptr) {
41 this->media_player_->add_on_state_callback([this]() {
42 switch (this->media_player_->state) {
45 // State changed to announcing after receiving the url
47 }
48 break;
49 default:
51 // No longer announcing the TTS response
53 }
54 break;
55 }
56 });
57 }
58#endif
59}
60
62
// Creates the UDP (SOCK_DGRAM) socket, switches it to non-blocking mode and, when a speaker is
// configured, binds it to port 6055 on the "any" address. Every fatal error marks the component
// as failed and returns false; on success udp_socket_running_ is set and true is returned.
64 this->socket_ = socket::socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
65 if (this->socket_ == nullptr) {
66 ESP_LOGE(TAG, "Could not create socket");
67 this->mark_failed();
68 return false;
69 }
70 int enable = 1;
71 int err = this->socket_->setsockopt(SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int));
72 if (err != 0) {
// NOTE(review): this prints the setsockopt() return value under the label "errno", whereas the
// sockaddr/bind failures below print the real errno - confirm which one is intended.
73 ESP_LOGW(TAG, "Socket unable to set reuseaddr: errno %d", err);
74 // we can still continue
75 }
76 err = this->socket_->setblocking(false);
77 if (err != 0) {
// NOTE(review): same as above - `err` is logged as "errno".
78 ESP_LOGE(TAG, "Socket unable to set nonblocking mode: errno %d", err);
79 this->mark_failed();
80 return false;
81 }
82
83#ifdef USE_SPEAKER
// Binding is only done when a speaker exists; without one the socket is only used for sending.
84 if (this->speaker_ != nullptr) {
85 struct sockaddr_storage server;
86
87 socklen_t sl = socket::set_sockaddr_any((struct sockaddr *) &server, sizeof(server), 6055);
88 if (sl == 0) {
89 ESP_LOGE(TAG, "Socket unable to set sockaddr: errno %d", errno);
90 this->mark_failed();
91 return false;
92 }
93
// NOTE(review): bind() is given sizeof(server) (the full sockaddr_storage) rather than the length
// `sl` returned by set_sockaddr_any() - confirm this is accepted on every supported platform.
94 err = this->socket_->bind((struct sockaddr *) &server, sizeof(server));
95 if (err != 0) {
96 ESP_LOGE(TAG, "Socket unable to bind: errno %d", errno);
97 this->mark_failed();
98 return false;
99 }
100 }
101#endif
102 this->udp_socket_running_ = true;
103 return true;
104}
105
// Allocates whichever of the speaker buffer, microphone ring buffer and send buffer do not exist
// yet. Already-allocated buffers are left untouched, so calling this repeatedly is cheap.
// Returns false as soon as one allocation fails (buffers allocated earlier are not released here).
107#ifdef USE_SPEAKER
// The speaker buffer is only needed when a speaker is configured.
108 if ((this->speaker_ != nullptr) && (this->speaker_buffer_ == nullptr)) {
109 RAMAllocator<uint8_t> speaker_allocator;
110 this->speaker_buffer_ = speaker_allocator.allocate(SPEAKER_BUFFER_SIZE);
111 if (this->speaker_buffer_ == nullptr) {
112 ESP_LOGW(TAG, "Could not allocate speaker buffer");
113 return false;
114 }
115 }
116#endif
117
// ring_buffer_ is a shared_ptr; a use_count() of 0 means it currently owns nothing.
118 if (this->ring_buffer_.use_count() == 0) {
119 this->ring_buffer_ = RingBuffer::create(RING_BUFFER_SIZE);
120 if (this->ring_buffer_.use_count() == 0) {
121 ESP_LOGE(TAG, "Could not allocate ring buffer");
122 return false;
123 }
124 }
125
126 if (this->send_buffer_ == nullptr) {
127 RAMAllocator<uint8_t> send_allocator;
128 this->send_buffer_ = send_allocator.allocate(SEND_BUFFER_SIZE);
129 if (send_buffer_ == nullptr) {
130 ESP_LOGW(TAG, "Could not allocate send buffer");
131 return false;
132 }
133 }
134
135 return true;
136}
137
// Resets the contents of every buffer that is currently allocated without freeing any of them:
// the send buffer is zeroed, the ring buffer is reset, and the speaker buffer is zeroed together
// with its size/index/received-bytes bookkeeping.
139 if (this->send_buffer_ != nullptr) {
140 memset(this->send_buffer_, 0, SEND_BUFFER_SIZE);
141 }
142
143 if (this->ring_buffer_ != nullptr) {
// Note: `->reset()` calls the RingBuffer's own reset; it does not release the shared_ptr.
144 this->ring_buffer_->reset();
145 }
146
147#ifdef USE_SPEAKER
148 if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
149 memset(this->speaker_buffer_, 0, SPEAKER_BUFFER_SIZE);
150
151 this->speaker_buffer_size_ = 0;
152 this->speaker_buffer_index_ = 0;
153 this->speaker_bytes_received_ = 0;
154 }
155#endif
156}
157
// Releases the send buffer, this object's reference to the ring buffer and the speaker buffer.
// Each pointer is set back to null afterwards so the buffers can be allocated again later.
159 if (this->send_buffer_ != nullptr) {
160 RAMAllocator<uint8_t> send_deallocator;
161 send_deallocator.deallocate(this->send_buffer_, SEND_BUFFER_SIZE);
162 this->send_buffer_ = nullptr;
163 }
164
165 if (this->ring_buffer_.use_count() > 0) {
// Note: `.reset()` here is shared_ptr::reset - it drops this owner's reference only; the ring
// buffer itself is destroyed once no other shared_ptr copy is still holding it.
166 this->ring_buffer_.reset();
167 }
168
169#ifdef USE_SPEAKER
170 if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
171 RAMAllocator<uint8_t> speaker_deallocator;
172 speaker_deallocator.deallocate(this->speaker_buffer_, SPEAKER_BUFFER_SIZE);
173 this->speaker_buffer_ = nullptr;
174 }
175#endif
176}
177
179 this->conversation_id_ = "";
180 ESP_LOGD(TAG, "reset conversation ID");
181}
182
184 if (this->api_client_ == nullptr && this->state_ != State::IDLE && this->state_ != State::STOP_MICROPHONE &&
186 if (this->mic_source_->is_running() || this->state_ == State::STARTING_MICROPHONE) {
188 } else {
190 }
191 this->continuous_ = false;
192 this->signal_stop_();
193 this->clear_buffers_();
194 return;
195 }
196 switch (this->state_) {
197 case State::IDLE: {
198 if (this->continuous_ && this->desired_state_ == State::IDLE) {
199 this->idle_trigger_->trigger();
201 } else {
202 this->deallocate_buffers_();
203 }
204 break;
205 }
207 ESP_LOGD(TAG, "Starting Microphone");
208 if (!this->allocate_buffers_()) {
209 this->status_set_error("Failed to allocate buffers");
210 return;
211 }
212 if (this->status_has_error()) {
213 this->status_clear_error();
214 }
215 this->clear_buffers_();
216
217 this->mic_source_->start();
219 break;
220 }
222 if (this->mic_source_->is_running()) {
223 this->set_state_(this->desired_state_);
224 }
225 break;
226 }
228 ESP_LOGD(TAG, "Requesting start");
229 uint32_t flags = 0;
230 if (!this->continue_conversation_ && this->use_wake_word_)
232 if (this->silence_detection_)
236 audio_settings.auto_gain = this->auto_gain_;
237 audio_settings.volume_multiplier = this->volume_multiplier_;
238
240 msg.start = true;
242 msg.flags = flags;
243 msg.audio_settings = audio_settings;
245
246 // Reset media player state tracking
247#ifdef USE_MEDIA_PLAYER
248 if (this->media_player_ != nullptr) {
250 }
251#endif
252
253 if (this->api_client_ == nullptr ||
255 ESP_LOGW(TAG, "Could not request start");
256 this->error_trigger_->trigger("not-connected", "Could not request start");
257 this->continuous_ = false;
259 break;
260 }
262 this->set_timeout("reset-conversation_id", this->conversation_timeout_,
263 [this]() { this->reset_conversation_id(); });
264 break;
265 }
267 break; // State changed when udp server port received
268 }
270 size_t available = this->ring_buffer_->available();
271 while (available >= SEND_BUFFER_SIZE) {
272 size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0);
273 if (this->audio_mode_ == AUDIO_MODE_API) {
275 msg.set_data(this->send_buffer_, read_bytes);
277 } else {
278 if (!this->udp_socket_running_) {
279 if (!this->start_udp_socket_()) {
281 break;
282 }
283 }
284 this->socket_->sendto(this->send_buffer_, read_bytes, 0, (struct sockaddr *) &this->dest_addr_,
285 sizeof(this->dest_addr_));
286 }
287 available = this->ring_buffer_->available();
288 }
289
290 break;
291 }
293 if (this->mic_source_->is_running()) {
294 this->mic_source_->stop();
296 } else {
297 this->set_state_(this->desired_state_);
298 }
299 break;
300 }
302 if (this->mic_source_->is_stopped()) {
303 this->set_state_(this->desired_state_);
304 }
305 break;
306 }
308 break; // State changed by events
309 }
311 bool playing = false;
312#ifdef USE_SPEAKER
313 if (this->speaker_ != nullptr) {
314 ssize_t received_len = 0;
315 if (this->audio_mode_ == AUDIO_MODE_UDP) {
316 if (this->speaker_buffer_index_ + RECEIVE_SIZE < SPEAKER_BUFFER_SIZE) {
317 received_len = this->socket_->read(this->speaker_buffer_ + this->speaker_buffer_index_, RECEIVE_SIZE);
318 if (received_len > 0) {
319 this->speaker_buffer_index_ += received_len;
320 this->speaker_buffer_size_ += received_len;
321 this->speaker_bytes_received_ += received_len;
322 }
323 } else {
324 ESP_LOGD(TAG, "Receive buffer full");
325 }
326 }
327 // Build a small buffer of audio before sending to the speaker
328 bool end_of_stream = this->stream_ended_ && (this->audio_mode_ == AUDIO_MODE_API || received_len < 0);
329 if (this->speaker_bytes_received_ > RECEIVE_SIZE * 4 || end_of_stream)
330 this->write_speaker_();
331 if (this->wait_for_stream_end_) {
332 this->cancel_timeout("playing");
333 if (end_of_stream) {
334 ESP_LOGD(TAG, "End of audio stream received");
335 this->cancel_timeout("speaker-timeout");
337 }
338 break; // We dont want to timeout here as the STREAM_END event will take care of that.
339 }
340 playing = this->speaker_->is_running();
341 }
342#endif
343#ifdef USE_MEDIA_PLAYER
344 if (this->media_player_ != nullptr) {
346
349 this->cancel_timeout("playing");
350 ESP_LOGD(TAG, "Announcement finished playing");
352
354 msg.success = true;
356 break;
357 }
358 }
359#endif
360 if (playing) {
362 }
363 break;
364 }
366#ifdef USE_SPEAKER
367 if (this->speaker_ != nullptr) {
368 if (this->speaker_buffer_size_ > 0) {
369 this->write_speaker_();
370 break;
371 }
372 if (this->speaker_->has_buffered_data() || this->speaker_->is_running()) {
373 break;
374 }
375 ESP_LOGD(TAG, "Speaker has finished outputting all audio");
376 this->speaker_->stop();
377 this->cancel_timeout("speaker-timeout");
378 this->cancel_timeout("playing");
379
380 this->clear_buffers_();
381
382 this->wait_for_stream_end_ = false;
383 this->stream_ended_ = false;
384
386 }
387#endif
388 if (this->continue_conversation_) {
390 } else {
392 }
393 break;
394 }
395 default:
396 break;
397 }
398}
399
// Hands the front of the staged response audio to the speaker.
// At most 4 KiB is offered per call; whatever the speaker accepts is removed from the front of
// speaker_buffer_ by shifting the remaining bytes down. Does nothing when there is no speaker,
// no speaker buffer, or no buffered audio.
400#ifdef USE_SPEAKER
402 if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
403 if (this->speaker_buffer_size_ > 0) {
404 size_t write_chunk = std::min<size_t>(this->speaker_buffer_size_, 4 * 1024);
405 size_t written = this->speaker_->play(this->speaker_buffer_, write_chunk);
406 if (written > 0) {
// Source and destination overlap, hence memmove rather than memcpy.
407 memmove(this->speaker_buffer_, this->speaker_buffer_ + written, this->speaker_buffer_size_ - written);
408 this->speaker_buffer_size_ -= written;
409 this->speaker_buffer_index_ -= written;
// Schedules the speaker to be stopped 5 s from now. Presumably re-using the same timeout name
// replaces the previously scheduled one, making this a "no successful write for 5 s" watchdog -
// TODO confirm against set_timeout()'s semantics.
410 this->set_timeout("speaker-timeout", 5000, [this]() { this->speaker_->stop(); });
411 } else {
412 ESP_LOGV(TAG, "Speaker buffer full, trying again next loop");
413 }
414 }
415 }
416}
417#endif
418
420 if (!subscribe) {
421 if (this->api_client_ == nullptr || client != this->api_client_) {
422 ESP_LOGE(TAG, "Client attempting to unsubscribe that is not the current API Client");
423 return;
424 }
425 this->api_client_ = nullptr;
427 return;
428 }
429
430 if (this->api_client_ != nullptr) {
431 ESP_LOGE(TAG, "Multiple API Clients attempting to connect to Voice Assistant");
432 ESP_LOGE(TAG, "Current client: %s (%s)", this->api_client_->get_name().c_str(),
433 this->api_client_->get_peername().c_str());
434 ESP_LOGE(TAG, "New client: %s (%s)", client->get_name().c_str(), client->get_peername().c_str());
435 return;
436 }
437
438 this->api_client_ = client;
440}
441
442static const LogString *voice_assistant_state_to_string(State state) {
443 switch (state) {
444 case State::IDLE:
445 return LOG_STR("IDLE");
447 return LOG_STR("START_MICROPHONE");
449 return LOG_STR("STARTING_MICROPHONE");
451 return LOG_STR("WAIT_FOR_VAD");
453 return LOG_STR("WAITING_FOR_VAD");
455 return LOG_STR("START_PIPELINE");
457 return LOG_STR("STARTING_PIPELINE");
459 return LOG_STR("STREAMING_MICROPHONE");
461 return LOG_STR("STOP_MICROPHONE");
463 return LOG_STR("STOPPING_MICROPHONE");
465 return LOG_STR("AWAITING_RESPONSE");
467 return LOG_STR("STREAMING_RESPONSE");
469 return LOG_STR("RESPONSE_FINISHED");
470 default:
471 return LOG_STR("UNKNOWN");
472 }
473};
474
476 State old_state = this->state_;
477 this->state_ = state;
478 ESP_LOGD(TAG, "State changed from %s to %s", LOG_STR_ARG(voice_assistant_state_to_string(old_state)),
479 LOG_STR_ARG(voice_assistant_state_to_string(state)));
480}
481
483 this->set_state_(state);
484 this->desired_state_ = desired_state;
485 ESP_LOGD(TAG, "Desired state set to %s", LOG_STR_ARG(voice_assistant_state_to_string(desired_state)));
486}
487
489 ESP_LOGE(TAG, "Failed to start server. See Home Assistant logs for more details.");
490 this->error_trigger_->trigger("failed-to-start", "Failed to start server. See Home Assistant logs for more details.");
492}
493
495 if (this->state_ != State::STARTING_PIPELINE) {
496 this->signal_stop_();
497 return;
498 }
499
500 ESP_LOGD(TAG, "Client started, streaming microphone");
502
503 if (this->mic_source_->is_running()) {
505 } else {
507 }
508}
509
510void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t port) {
511 if (this->state_ != State::STARTING_PIPELINE) {
512 this->signal_stop_();
513 return;
514 }
515
516 ESP_LOGD(TAG, "Client started, streaming microphone");
518
519 memcpy(&this->dest_addr_, addr, sizeof(this->dest_addr_));
520 if (this->dest_addr_.ss_family == AF_INET) {
521 ((struct sockaddr_in *) &this->dest_addr_)->sin_port = htons(port);
522 }
523#if LWIP_IPV6
524 else if (this->dest_addr_.ss_family == AF_INET6) {
525 ((struct sockaddr_in6 *) &this->dest_addr_)->sin6_port = htons(port);
526 }
527#endif
528 else {
529 ESP_LOGW(TAG, "Unknown address family: %d", this->dest_addr_.ss_family);
530 return;
531 }
532
533 if (this->mic_source_->is_running()) {
535 } else {
537 }
538}
539
540void VoiceAssistant::request_start(bool continuous, bool silence_detection) {
541 if (this->api_client_ == nullptr) {
542 ESP_LOGE(TAG, "No API client connected");
544 this->continuous_ = false;
545 return;
546 }
547 if (this->state_ == State::IDLE) {
548 this->continuous_ = continuous;
549 this->silence_detection_ = silence_detection;
550
552 }
553}
554
556 this->continuous_ = false;
557 this->continue_conversation_ = false;
558
559 switch (this->state_) {
560 case State::IDLE:
561 break;
568 break;
571 this->signal_stop_();
573 break;
577 break;
579 this->signal_stop_();
580 break;
582#ifdef USE_MEDIA_PLAYER
583 // Stop any ongoing media player announcement
584 if (this->media_player_ != nullptr) {
585 this->media_player_->make_call()
587 .set_announcement(true)
588 .perform();
589 }
590 if (this->started_streaming_tts_) {
591 // Haven't reached the TTS_END stage, so send the stop signal to HA.
592 this->signal_stop_();
593 }
594#endif
595 break;
597 break; // Let the incoming audio stream finish then it will go to idle.
598 }
599}
600
602 memset(&this->dest_addr_, 0, sizeof(this->dest_addr_));
603 if (this->api_client_ == nullptr) {
604 return;
605 }
606 ESP_LOGD(TAG, "Signaling stop");
608 msg.start = false;
610}
611
613 this->set_timeout("playing", 2000, [this]() {
614 this->cancel_timeout("speaker-timeout");
616
618 msg.success = true;
620 });
621}
622
624 ESP_LOGD(TAG, "Event Type: %" PRId32, msg.event_type);
625 switch (msg.event_type) {
627 ESP_LOGD(TAG, "Assist Pipeline running");
628#ifdef USE_MEDIA_PLAYER
629 this->started_streaming_tts_ = false;
630 for (auto arg : msg.data) {
631 if (arg.name == "url") {
632 this->tts_response_url_ = std::move(arg.value);
633 }
634 }
635#endif
636 this->defer([this]() { this->start_trigger_->trigger(); });
637 break;
639 break;
641 ESP_LOGD(TAG, "Wake word detected");
642 this->defer([this]() { this->wake_word_detected_trigger_->trigger(); });
643 break;
644 }
646 ESP_LOGD(TAG, "STT started");
647 this->defer([this]() { this->listening_trigger_->trigger(); });
648 break;
650 std::string text;
651 for (auto arg : msg.data) {
652 if (arg.name == "text") {
653 text = std::move(arg.value);
654 }
655 }
656 if (text.empty()) {
657 ESP_LOGW(TAG, "No text in STT_END event");
658 return;
659 } else if (text.length() > 500) {
660 text.resize(497);
661 text += "...";
662 }
663 ESP_LOGD(TAG, "Speech recognised as: \"%s\"", text.c_str());
664 this->defer([this, text]() { this->stt_end_trigger_->trigger(text); });
665 break;
666 }
668 ESP_LOGD(TAG, "Intent started");
669 this->defer([this]() { this->intent_start_trigger_->trigger(); });
670 break;
672 ESP_LOGD(TAG, "Intent progress");
673 std::string tts_url_for_trigger = "";
674#ifdef USE_MEDIA_PLAYER
675 if (this->media_player_ != nullptr) {
676 for (const auto &arg : msg.data) {
677 if ((arg.name == "tts_start_streaming") && (arg.value == "1") && !this->tts_response_url_.empty()) {
679
681
682 this->started_streaming_tts_ = true;
684
685 tts_url_for_trigger = this->tts_response_url_;
686 this->tts_response_url_.clear(); // Reset streaming URL
688 }
689 }
690 }
691#endif
692 this->defer([this, tts_url_for_trigger]() { this->intent_progress_trigger_->trigger(tts_url_for_trigger); });
693 break;
694 }
696 for (auto arg : msg.data) {
697 if (arg.name == "conversation_id") {
698 this->conversation_id_ = std::move(arg.value);
699 } else if (arg.name == "continue_conversation") {
700 this->continue_conversation_ = (arg.value == "1");
701 }
702 }
703 this->defer([this]() { this->intent_end_trigger_->trigger(); });
704 break;
705 }
707 std::string text;
708 for (auto arg : msg.data) {
709 if (arg.name == "text") {
710 text = std::move(arg.value);
711 }
712 }
713 if (text.empty()) {
714 ESP_LOGW(TAG, "No text in TTS_START event");
715 return;
716 }
717 if (text.length() > 500) {
718 text.resize(497);
719 text += "...";
720 }
721 ESP_LOGD(TAG, "Response: \"%s\"", text.c_str());
722 this->defer([this, text]() {
723 this->tts_start_trigger_->trigger(text);
724#ifdef USE_SPEAKER
725 if (this->speaker_ != nullptr) {
726 this->speaker_->start();
727 }
728#endif
729 });
730 break;
731 }
733 std::string url;
734 for (auto arg : msg.data) {
735 if (arg.name == "url") {
736 url = std::move(arg.value);
737 }
738 }
739 if (url.empty()) {
740 ESP_LOGW(TAG, "No url in TTS_END event");
741 return;
742 }
743 ESP_LOGD(TAG, "Response URL: \"%s\"", url.c_str());
744 this->defer([this, url]() {
745#ifdef USE_MEDIA_PLAYER
746 if ((this->media_player_ != nullptr) && (!this->started_streaming_tts_)) {
748
750
752 }
753 this->started_streaming_tts_ = false; // Helps indicate reaching the TTS_END stage
754#endif
755 this->tts_end_trigger_->trigger(url);
756 });
758 if (new_state != this->state_) {
759 // Don't needlessly change the state. The intent progress stage may have already changed the state to streaming
760 // response.
761 this->set_state_(new_state, new_state);
762 }
763 break;
764 }
766 ESP_LOGD(TAG, "Assist Pipeline ended");
767 if ((this->state_ == State::START_PIPELINE) || (this->state_ == State::STARTING_PIPELINE) ||
769 // Microphone is running, stop it
771 } else if (this->state_ == State::AWAITING_RESPONSE) {
772 // No TTS start event ("nevermind")
774 }
775 this->defer([this]() { this->end_trigger_->trigger(); });
776 break;
777 }
779 std::string code = "";
780 std::string message = "";
781 for (auto arg : msg.data) {
782 if (arg.name == "code") {
783 code = std::move(arg.value);
784 } else if (arg.name == "message") {
785 message = std::move(arg.value);
786 }
787 }
788 if (code == "wake-word-timeout" || code == "wake_word_detection_aborted" || code == "no_wake_word") {
789 // Don't change state here since either the "tts-end" or "run-end" events will do it.
790 return;
791 } else if (code == "wake-provider-missing" || code == "wake-engine-missing") {
792 // Wake word is not set up or not ready on Home Assistant so stop and do not retry until user starts again.
793 this->defer([this, code, message]() {
794 this->request_stop();
795 this->error_trigger_->trigger(code, message);
796 });
797 return;
798 }
799 ESP_LOGE(TAG, "Error: %s - %s", code.c_str(), message.c_str());
800 if (this->state_ != State::IDLE) {
801 this->signal_stop_();
803 }
804 this->defer([this, code, message]() { this->error_trigger_->trigger(code, message); });
805 break;
806 }
808#ifdef USE_SPEAKER
809 if (this->speaker_ != nullptr) {
810 this->wait_for_stream_end_ = true;
811 ESP_LOGD(TAG, "TTS stream start");
812 this->defer([this] { this->tts_stream_start_trigger_->trigger(); });
813 }
814#endif
815 break;
816 }
818#ifdef USE_SPEAKER
819 if (this->speaker_ != nullptr) {
820 this->stream_ended_ = true;
821 ESP_LOGD(TAG, "TTS stream end");
822 }
823#endif
824 break;
825 }
827 ESP_LOGD(TAG, "Starting STT by VAD");
828 this->defer([this]() { this->stt_vad_start_trigger_->trigger(); });
829 break;
831 ESP_LOGD(TAG, "STT by VAD end");
833 this->defer([this]() { this->stt_vad_end_trigger_->trigger(); });
834 break;
835 default:
836 ESP_LOGD(TAG, "Unhandled event type: %" PRId32, msg.event_type);
837 break;
838 }
839}
840
// Appends a chunk of response audio delivered in `msg` to speaker_buffer_ and updates the
// buffer bookkeeping. The chunk is dropped (with an error log) if it does not fit.
842#ifdef USE_SPEAKER // We should never get to this function if there is no speaker anyway
843 if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
// NOTE(review): the strict `<` rejects a chunk that would fill the buffer exactly - confirm
// whether leaving the last byte unused is intentional.
844 if (this->speaker_buffer_index_ + msg.data.length() < SPEAKER_BUFFER_SIZE) {
845 memcpy(this->speaker_buffer_ + this->speaker_buffer_index_, msg.data.data(), msg.data.length());
846 this->speaker_buffer_index_ += msg.data.length();
847 this->speaker_buffer_size_ += msg.data.length();
848 this->speaker_bytes_received_ += msg.data.length();
// NOTE(review): `%u` is used for msg.data.length(); if that returns size_t the portable
// specifier would be `%zu` - the return type is not visible here, confirm.
849 ESP_LOGV(TAG, "Received audio: %u bytes from API", msg.data.length());
850 } else {
851 ESP_LOGE(TAG, "Cannot receive audio, buffer is full");
852 }
853 }
854#endif
855}
856
858 Timer timer = {
859 .id = msg.timer_id,
860 .name = msg.name,
861 .total_seconds = msg.total_seconds,
862 .seconds_left = msg.seconds_left,
863 .is_active = msg.is_active,
864 };
865 this->timers_[timer.id] = timer;
866 ESP_LOGD(TAG, "Timer Event");
867 ESP_LOGD(TAG, " Type: %" PRId32, msg.event_type);
868 ESP_LOGD(TAG, " %s", timer.to_string().c_str());
869
870 switch (msg.event_type) {
872 this->timer_started_trigger_->trigger(timer);
873 break;
875 this->timer_updated_trigger_->trigger(timer);
876 break;
878 this->timer_cancelled_trigger_->trigger(timer);
879 this->timers_.erase(timer.id);
880 break;
882 this->timer_finished_trigger_->trigger(timer);
883 this->timers_.erase(timer.id);
884 break;
885 }
886
887 if (this->timers_.empty()) {
888 this->cancel_interval("timer-event");
889 this->timer_tick_running_ = false;
890 } else if (!this->timer_tick_running_) {
891 this->set_interval("timer-event", 1000, [this]() { this->timer_tick_(); });
892 this->timer_tick_running_ = true;
893 }
894}
895
// Periodic tick for the locally tracked timers (scheduled through the 1000 ms "timer-event"
// interval). Counts every active timer down by one second, never below zero, and then fires
// timer_tick_trigger_ with a copy of all timers, including inactive and already-expired ones.
897 std::vector<Timer> res;
898 res.reserve(this->timers_.size());
899 for (auto &pair : this->timers_) {
// Taken by reference so the decrement updates the stored timer, not a copy.
900 auto &timer = pair.second;
901 if (timer.is_active && timer.seconds_left > 0) {
902 timer.seconds_left--;
903 }
904 res.push_back(timer);
905 }
906 this->timer_tick_trigger_->trigger(res);
907}
908
910#ifdef USE_MEDIA_PLAYER
911 if (this->media_player_ != nullptr) {
912 this->tts_start_trigger_->trigger(msg.text);
913
915
916 if (!msg.preannounce_media_id.empty()) {
918 }
919 // Enqueueing a URL with an empty playlist will still play the file immediately
920 this->media_player_->make_call()
923 .set_announcement(true)
924 .perform();
926
928
929 if (this->continuous_) {
931 } else {
933 }
934
936 this->end_trigger_->trigger();
937 }
938#endif
939}
940
941void VoiceAssistant::on_set_configuration(const std::vector<std::string> &active_wake_words) {
942#ifdef USE_MICRO_WAKE_WORD
943 if (this->micro_wake_word_) {
944 // Disable all wake words first
945 for (auto &model : this->micro_wake_word_->get_wake_words()) {
946 model->disable();
947 }
948
949 // Enable only active wake words
950 for (auto ww_id : active_wake_words) {
951 for (auto &model : this->micro_wake_word_->get_wake_words()) {
952 if (model->get_id() == ww_id) {
953 model->enable();
954 ESP_LOGD(TAG, "Enabled wake word: %s (id=%s)", model->get_wake_word().c_str(), model->get_id().c_str());
955 }
956 }
957 }
958 }
959#endif
960};
961
963 this->config_.available_wake_words.clear();
964 this->config_.active_wake_words.clear();
965
966#ifdef USE_MICRO_WAKE_WORD
967 if (this->micro_wake_word_) {
969
970 for (auto &model : this->micro_wake_word_->get_wake_words()) {
971 if (model->is_enabled()) {
972 this->config_.active_wake_words.push_back(model->get_id());
973 }
974
975 WakeWord wake_word;
976 wake_word.id = model->get_id();
977 wake_word.wake_word = model->get_wake_word();
978 for (const auto &lang : model->get_trained_languages()) {
979 wake_word.trained_languages.push_back(lang);
980 }
981 this->config_.available_wake_words.push_back(std::move(wake_word));
982 }
983 } else {
984#endif
985 // No microWakeWord
987#ifdef USE_MICRO_WAKE_WORD
988 }
989#endif
990
991 return this->config_;
992};
993
994VoiceAssistant *global_voice_assistant = nullptr; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
995
996} // namespace voice_assistant
997} // namespace esphome
998
999#endif // USE_VOICE_ASSISTANT
virtual void mark_failed()
Mark this component as failed.
void set_interval(const std::string &name, uint32_t interval, std::function< void()> &&f)
Set an interval function with a unique name.
Definition component.cpp:98
bool cancel_timeout(const std::string &name)
Cancel a timeout function.
bool status_has_error() const
bool cancel_interval(const std::string &name)
Cancel an interval function.
void defer(const std::string &name, std::function< void()> &&f)
Defer a callback to the next loop() call.
void set_timeout(const std::string &name, uint32_t timeout, std::function< void()> &&f)
Set a timeout function with a unique name.
void status_set_error(const char *message=nullptr)
An STL allocator that uses SPI or internal RAM.
Definition helpers.h:1084
void deallocate(T *p, size_t n)
Definition helpers.h:1142
T * allocate(size_t n)
Definition helpers.h:1104
static std::unique_ptr< RingBuffer > create(size_t len)
StringRef is a reference to a string owned by something else.
Definition string_ref.h:22
void trigger(const Ts &...x)
Inform the parent automation that the event has triggered.
Definition automation.h:169
const std::string & get_peername() const
const std::string & get_name() const
bool send_message(const ProtoMessage &msg, uint8_t message_type)
static constexpr uint8_t MESSAGE_TYPE
Definition api_pb2.h:2467
void set_data(const uint8_t *data, size_t len)
Definition api_pb2.h:2410
static constexpr uint8_t MESSAGE_TYPE
Definition api_pb2.h:2402
enums::VoiceAssistantEvent event_type
Definition api_pb2.h:2390
std::vector< VoiceAssistantEventData > data
Definition api_pb2.h:2391
void set_wake_word_phrase(const StringRef &ref)
Definition api_pb2.h:2347
static constexpr uint8_t MESSAGE_TYPE
Definition api_pb2.h:2336
VoiceAssistantAudioSettings audio_settings
Definition api_pb2.h:2345
void set_conversation_id(const StringRef &ref)
Definition api_pb2.h:2343
enums::VoiceAssistantTimerEvent event_type
Definition api_pb2.h:2432
MediaPlayerCall & set_media_url(const std::string &url)
MediaPlayerCall & set_announcement(bool announce)
MediaPlayerCall & set_command(MediaPlayerCommand command)
void add_on_state_callback(std::function< void()> &&callback)
std::vector< WakeWordModel * > get_wake_words()
void add_data_callback(std::function< void(const std::vector< uint8_t > &)> &&data_callback)
virtual size_t play(const uint8_t *data, size_t length)=0
Plays the provided audio data.
bool is_running() const
Definition speaker.h:66
virtual bool has_buffered_data() const =0
virtual void start()=0
virtual void stop()=0
std::unique_ptr< socket::Socket > socket_
std::unordered_map< std::string, Timer > timers_
void on_timer_event(const api::VoiceAssistantTimerEventResponse &msg)
void on_audio(const api::VoiceAssistantAudio &msg)
media_player::MediaPlayer * media_player_
Trigger< std::string, std::string > * error_trigger_
void client_subscription(api::APIConnection *client, bool subscribe)
MediaPlayerResponseState media_player_response_state_
Trigger< std::vector< Timer > > * timer_tick_trigger_
std::shared_ptr< RingBuffer > ring_buffer_
void on_event(const api::VoiceAssistantEventResponse &msg)
Trigger< std::string > * tts_start_trigger_
void on_announce(const api::VoiceAssistantAnnounceRequest &msg)
void request_start(bool continuous, bool silence_detection)
Trigger< std::string > * intent_progress_trigger_
microphone::MicrophoneSource * mic_source_
micro_wake_word::MicroWakeWord * micro_wake_word_
void on_set_configuration(const std::vector< std::string > &active_wake_words)
const char * message
Definition component.cpp:38
uint16_t flags
bool state
Definition fan.h:0
uint32_t socklen_t
Definition headers.h:97
__int64 ssize_t
Definition httplib.h:178
@ VOICE_ASSISTANT_REQUEST_USE_WAKE_WORD
Definition api_pb2.h:208
@ VOICE_ASSISTANT_REQUEST_USE_VAD
Definition api_pb2.h:207
@ VOICE_ASSISTANT_TIMER_UPDATED
Definition api_pb2.h:231
@ VOICE_ASSISTANT_TIMER_STARTED
Definition api_pb2.h:230
@ VOICE_ASSISTANT_TIMER_FINISHED
Definition api_pb2.h:233
@ VOICE_ASSISTANT_TIMER_CANCELLED
Definition api_pb2.h:232
@ VOICE_ASSISTANT_WAKE_WORD_START
Definition api_pb2.h:221
@ VOICE_ASSISTANT_TTS_STREAM_END
Definition api_pb2.h:226
@ VOICE_ASSISTANT_STT_VAD_START
Definition api_pb2.h:223
@ VOICE_ASSISTANT_INTENT_PROGRESS
Definition api_pb2.h:227
@ VOICE_ASSISTANT_TTS_STREAM_START
Definition api_pb2.h:225
@ VOICE_ASSISTANT_WAKE_WORD_END
Definition api_pb2.h:222
const float AFTER_CONNECTION
For components that should be initialized after a data connection (API/MQTT) is connected.
Definition component.cpp:67
std::unique_ptr< Socket > socket(int domain, int type, int protocol)
Create a socket of the given domain, type and protocol.
socklen_t set_sockaddr_any(struct sockaddr *addr, socklen_t addrlen, uint16_t port)
Set a sockaddr to the any address and specified port for the IP version used by socket_ip().
Definition socket.cpp:82
VoiceAssistant * global_voice_assistant
Providing packet encoding functions for exchanging data with a remote host.
Definition a01nyub.cpp:7
std::vector< WakeWord > available_wake_words
std::vector< std::string > active_wake_words
std::vector< std::string > trained_languages
sa_family_t ss_family
Definition headers.h:92