4#ifdef USE_VOICE_ASSISTANT
/// Tag passed as the first argument to the ESP_LOGx calls in this file.
static const char *const TAG = "voice_assistant";

/// Sample rate, in Hz, that all buffer sizes below are derived from.
static constexpr size_t SAMPLE_RATE_HZ = 16000;

// The sample counts below are computed as `milliseconds * SAMPLE_RATE_HZ / 1000`.
// Guard against a future sample rate that would make that integer division truncate.
static_assert(SAMPLE_RATE_HZ % 1000 == 0, "SAMPLE_RATE_HZ must be a whole number of samples per millisecond");

/// Ring buffer capacity in samples (512 ms of audio at SAMPLE_RATE_HZ).
static constexpr size_t RING_BUFFER_SAMPLES = 512 * SAMPLE_RATE_HZ / 1000;
/// Ring buffer capacity in bytes; each sample is one int16_t.
static constexpr size_t RING_BUFFER_SIZE = RING_BUFFER_SAMPLES * sizeof(int16_t);

/// Send buffer capacity in samples (32 ms of audio at SAMPLE_RATE_HZ).
static constexpr size_t SEND_BUFFER_SAMPLES = 32 * SAMPLE_RATE_HZ / 1000;
/// Send buffer capacity in bytes; each sample is one int16_t.
static constexpr size_t SEND_BUFFER_SIZE = SEND_BUFFER_SAMPLES * sizeof(int16_t);

/// Size of a single receive chunk (presumably bytes - confirm against the receive path).
static constexpr size_t RECEIVE_SIZE = 1024;
/// Speaker buffer capacity: room for 16 receive chunks.
static constexpr size_t SPEAKER_BUFFER_SIZE = 16 * RECEIVE_SIZE;

/// Timeout, in milliseconds, associated with a stalled audio channel.
static constexpr uint32_t AUDIO_CHANNEL_STALL_TIMEOUT_MS = 2000;
39 std::shared_ptr<ring_buffer::RingBuffer> temp_ring_buffer = this->
ring_buffer_.lock();
40 if (temp_ring_buffer !=
nullptr) {
41 temp_ring_buffer->write((
void *) data.data(), data.size());
48 std::shared_ptr<ring_buffer::RingBuffer> temp_ring_buffer = this->
ring_buffer2_.lock();
49 if (temp_ring_buffer !=
nullptr) {
50 temp_ring_buffer->write((
void *) data.data(), data.size());
55#ifdef USE_MEDIA_PLAYER
82 ESP_LOGE(TAG,
"Could not create socket");
87 int err = this->
socket_->setsockopt(SOL_SOCKET, SO_REUSEADDR, &enable,
sizeof(
int));
89 ESP_LOGW(TAG,
"Socket unable to set reuseaddr: errno %d", err);
92 err = this->
socket_->setblocking(
false);
94 ESP_LOGE(TAG,
"Socket unable to set nonblocking mode: errno %d", err);
105 ESP_LOGE(TAG,
"Socket unable to set sockaddr: errno %d", errno);
110 err = this->
socket_->bind((
struct sockaddr *) &server,
sizeof(server));
112 ESP_LOGE(TAG,
"Socket unable to bind: errno %d", errno);
128 ESP_LOGW(TAG,
"Could not allocate speaker buffer");
136 if (temp_ring_buffer ==
nullptr) {
137 ESP_LOGE(TAG,
"Could not allocate ring buffer");
143 ESP_LOGE(TAG,
"Could not allocate audio source");
152 if (temp_ring_buffer ==
nullptr) {
153 ESP_LOGE(TAG,
"Could not allocate second ring buffer");
158 ESP_LOGE(TAG,
"Could not allocate second audio source");
209 ESP_LOGD(TAG,
"reset conversation ID");
224 size_t available2 = 0;
230 const bool channel_empty = (available == 0);
231 const bool channel2_empty = (this->
audio_source2_ !=
nullptr) && (available2 == 0);
232 if (channel_empty || channel2_empty) {
265 if ((available == 0) && (available2 == 0)) {
275 ESP_LOGW(TAG,
"Mic channel %d stalled, stopping stream", (available == 0) ? 0 : 1);
279 this->
defer([
this]() {
310 ESP_LOGD(TAG,
"Starting Microphone");
334 ESP_LOGD(TAG,
"Requesting start");
353#ifdef USE_MEDIA_PLAYER
360 ESP_LOGW(TAG,
"Could not request start");
384 if (available == 0) {
394 sizeof(this->dest_addr_));
403 bool is_running2 =
false;
407 if (is_running || is_running2) {
423 bool is_stopped2 =
true;
427 if (is_stopped && is_stopped2) {
436 bool playing =
false;
443 if (received_len > 0) {
449 ESP_LOGD(TAG,
"Receive buffer full");
459 ESP_LOGD(TAG,
"End of audio stream received");
468#ifdef USE_MEDIA_PLAYER
475 ESP_LOGD(TAG,
"Announcement finished playing");
500 ESP_LOGD(TAG,
"Speaker has finished outputting all audio");
537 ESP_LOGV(TAG,
"Speaker buffer full, trying again next loop");
547 ESP_LOGE(TAG,
"Client attempting to unsubscribe that is not the current API Client");
556 char current_peername[socket::SOCKADDR_STR_LEN];
557 char new_peername[socket::SOCKADDR_STR_LEN];
559 "Multiple API Clients attempting to connect to Voice Assistant\n"
560 " Current client: %s (%s)\n"
561 " New client: %s (%s)",
571static const LogString *voice_assistant_state_to_string(
State state) {
574 return LOG_STR(
"IDLE");
576 return LOG_STR(
"START_MICROPHONE");
578 return LOG_STR(
"STARTING_MICROPHONE");
580 return LOG_STR(
"WAIT_FOR_VAD");
582 return LOG_STR(
"WAITING_FOR_VAD");
584 return LOG_STR(
"START_PIPELINE");
586 return LOG_STR(
"STARTING_PIPELINE");
588 return LOG_STR(
"STREAMING_MICROPHONE");
590 return LOG_STR(
"STOP_MICROPHONE");
592 return LOG_STR(
"STOPPING_MICROPHONE");
594 return LOG_STR(
"AWAITING_RESPONSE");
596 return LOG_STR(
"STREAMING_RESPONSE");
598 return LOG_STR(
"RESPONSE_FINISHED");
600 return LOG_STR(
"UNKNOWN");
607 ESP_LOGD(TAG,
"State changed from %s to %s", LOG_STR_ARG(voice_assistant_state_to_string(old_state)),
608 LOG_STR_ARG(voice_assistant_state_to_string(
state)));
614 ESP_LOGD(TAG,
"Desired state set to %s", LOG_STR_ARG(voice_assistant_state_to_string(desired_state)));
618 ESP_LOGE(TAG,
"Failed to start server. See Home Assistant logs for more details.");
619 this->
error_trigger_.
trigger(
"failed-to-start",
"Failed to start server. See Home Assistant logs for more details.");
629 ESP_LOGD(TAG,
"Client started, streaming microphone");
646 ESP_LOGD(TAG,
"Client started, streaming microphone");
650 ESP_LOGW(TAG,
"UDP audio mode does not support a second microphone channel; only the primary will be streamed");
677 ESP_LOGE(TAG,
"No API client connected");
717#ifdef USE_MEDIA_PLAYER
741 ESP_LOGD(TAG,
"Signaling stop");
761 ESP_LOGD(TAG,
"Event Type: %" PRId32, msg.
event_type);
764 ESP_LOGD(TAG,
"Assist Pipeline running");
765#ifdef USE_MEDIA_PLAYER
767 for (
const auto &arg : msg.
data) {
768 if (arg.name ==
"url") {
778 ESP_LOGD(TAG,
"Wake word detected");
783 ESP_LOGD(TAG,
"STT started");
788 for (
const auto &arg : msg.
data) {
789 if (arg.name ==
"text") {
794 ESP_LOGW(TAG,
"No text in STT_END event");
796 }
else if (text.length() > 500) {
800 ESP_LOGD(TAG,
"Speech recognised as: \"%s\"", text.c_str());
805 ESP_LOGD(TAG,
"Intent started");
809 ESP_LOGD(TAG,
"Intent progress");
810 std::string tts_url_for_trigger;
811#ifdef USE_MEDIA_PLAYER
813 for (
const auto &arg : msg.
data) {
814 if ((arg.name ==
"tts_start_streaming") && (arg.value ==
"1") && !this->tts_response_url_.empty()) {
833 for (
const auto &arg : msg.
data) {
834 if (arg.name ==
"conversation_id") {
836 }
else if (arg.name ==
"continue_conversation") {
845 for (
const auto &arg : msg.
data) {
846 if (arg.name ==
"text") {
851 ESP_LOGW(TAG,
"No text in TTS_START event");
854 if (text.length() > 500) {
858 ESP_LOGD(TAG,
"Response: \"%s\"", text.c_str());
859 this->
defer([
this, text]() {
871 for (
const auto &arg : msg.
data) {
872 if (arg.name ==
"url") {
877 ESP_LOGW(TAG,
"No url in TTS_END event");
880 ESP_LOGD(TAG,
"Response URL: \"%s\"", url.c_str());
881 this->
defer([
this, url]() {
882#ifdef USE_MEDIA_PLAYER
895 if (new_state != this->
state_) {
903 ESP_LOGD(TAG,
"Assist Pipeline ended");
918 for (
const auto &arg : msg.
data) {
919 if (arg.name ==
"code") {
921 }
else if (arg.name ==
"message") {
925 if (code ==
"wake-word-timeout" || code ==
"wake_word_detection_aborted" || code ==
"no_wake_word") {
928 }
else if (code ==
"wake-provider-missing" || code ==
"wake-engine-missing") {
936 ESP_LOGE(TAG,
"Error: %s - %s", code.c_str(),
message.c_str());
948 ESP_LOGD(TAG,
"TTS stream start");
958 ESP_LOGD(TAG,
"TTS stream end");
964 ESP_LOGD(TAG,
"Starting STT by VAD");
968 ESP_LOGD(TAG,
"STT by VAD end");
973 ESP_LOGD(TAG,
"Unhandled event type: %" PRId32, msg.
event_type);
986 ESP_LOGV(TAG,
"Received audio: %u bytes from API", msg.
data_len);
988 ESP_LOGE(TAG,
"Cannot receive audio, buffer is full");
996 auto it = this->
timers_.begin();
997 for (; it != this->
timers_.end(); ++it) {
1001 if (it == this->
timers_.end()) {
1006 it->name = msg.
name;
1014 " Type: %" PRId32
"\n"
1045 for (
auto &timer : this->
timers_) {
1046 if (timer.is_active && timer.seconds_left > 0) {
1047 timer.seconds_left--;
1054#ifdef USE_MEDIA_PLAYER
1086#ifdef USE_MICRO_WAKE_WORD
1094 for (
const auto &ww_id : active_wake_words) {
1096 if (model->get_id() == ww_id) {
1098 ESP_LOGD(TAG,
"Enabled wake word: %s (id=%s)", model->get_wake_word().c_str(), model->get_id().c_str());
1110#ifdef USE_MICRO_WAKE_WORD
1115 if (model->is_enabled()) {
1120 wake_word.
id = model->get_id();
1121 wake_word.
wake_word = model->get_wake_word();
1122 for (
const auto &lang : model->get_trained_languages()) {
1131#ifdef USE_MICRO_WAKE_WORD
uint32_t IRAM_ATTR HOT get_loop_component_start_time() const
Get the cached time in milliseconds from when the current component started its loop execution.
void mark_failed()
Mark this component as failed.
ESPDEPRECATED("Use const char* overload instead. Removed in 2026.7.0", "2026.1.0") void defer(const std voi defer)(const char *name, std::function< void()> &&f)
Defer a callback to the next loop() call.
ESPDEPRECATED("Use const char* or uint32_t overload instead. Removed in 2026.7.0", "2026.1.0") void set_timeout(const std voi set_timeout)(const char *name, uint32_t timeout, std::function< void()> &&f)
Set a timeout function with a unique name.
void status_clear_error()
ESPDEPRECATED("Use const char* or uint32_t overload instead. Removed in 2026.7.0", "2026.1.0") void set_interval(const std voi set_interval)(const char *name, uint32_t interval, std::function< void()> &&f)
Set an interval function with a unique name.
bool status_has_error() const
ESPDEPRECATED("Use const char* or uint32_t overload instead. Removed in 2026.7.0", "2026.1.0") bool cancel_timeout(const std boo cancel_timeout)(const char *name)
Cancel a timeout function.
ESPDEPRECATED("Use const char* or uint32_t overload instead. Removed in 2026.7.0", "2026.1.0") bool cancel_interval(const std boo cancel_interval)(const char *name)
Cancel an interval function.
An STL allocator that uses SPI or internal RAM.
void deallocate(T *p, size_t n)
StringRef is a reference to a string owned by something else.
constexpr bool empty() const
void trigger(const Ts &...x) ESPHOME_ALWAYS_INLINE
Inform the parent automation that the event has triggered.
const char * get_peername_to(std::span< char, socket::SOCKADDR_STR_LEN > buf) const
Get peer name (IP address) into caller-provided buffer, returns buf for convenience.
const char * get_name() const
bool send_message(const T &msg)
StringRef preannounce_media_id
uint32_t noise_suppression_level
enums::VoiceAssistantEvent event_type
std::vector< VoiceAssistantEventData > data
StringRef conversation_id
StringRef wake_word_phrase
VoiceAssistantAudioSettings audio_settings
enums::VoiceAssistantTimerEvent event_type
static std::unique_ptr< RingBufferAudioSource > create(std::shared_ptr< ring_buffer::RingBuffer > ring_buffer, size_t max_fill_bytes, uint8_t alignment_bytes=1)
Creates a new ring-buffer-backed audio source after validating its parameters.
std::vector< WakeWordModel * > get_wake_words()
void add_data_callback(F &&data_callback)
static std::unique_ptr< RingBuffer > create(size_t len, MemoryPreference preference=MemoryPreference::EXTERNAL_FIRST)
virtual size_t play(const uint8_t *data, size_t length)=0
Plays the provided audio data.
virtual bool has_buffered_data() const =0
Trigger intent_end_trigger_
std::unique_ptr< socket::Socket > socket_
const Configuration & get_configuration()
size_t speaker_buffer_size_
bool started_streaming_tts_
std::string tts_response_url_
microphone::MicrophoneSource * mic_source2_
bool wait_for_stream_end_
size_t speaker_buffer_index_
Trigger intent_start_trigger_
void on_timer_event(const api::VoiceAssistantTimerEventResponse &msg)
Trigger tts_stream_start_trigger_
Trigger client_connected_trigger_
Trigger< Timer > timer_cancelled_trigger_
std::vector< Timer > timers_
void on_audio(const api::VoiceAssistantAudio &msg)
std::unique_ptr< audio::RingBufferAudioSource > audio_source_
std::weak_ptr< ring_buffer::RingBuffer > ring_buffer2_
std::weak_ptr< ring_buffer::RingBuffer > ring_buffer_
media_player::MediaPlayer * media_player_
float get_setup_priority() const override
Trigger wake_word_detected_trigger_
void set_state_(State state)
uint8_t * speaker_buffer_
Trigger stt_vad_end_trigger_
void client_subscription(api::APIConnection *client, bool subscribe)
Trigger< Timer > timer_started_trigger_
MediaPlayerResponseState media_player_response_state_
void deallocate_buffers_()
Trigger listening_trigger_
uint8_t noise_suppression_level_
void on_event(const api::VoiceAssistantEventResponse &msg)
Trigger< std::string, std::string > error_trigger_
Trigger< std::string > stt_end_trigger_
uint32_t conversation_timeout_
Trigger< const std::vector< Timer > & > timer_tick_trigger_
uint32_t audio_channel_stall_start_
Trigger< std::string > intent_progress_trigger_
void on_announce(const api::VoiceAssistantAnnounceRequest &msg)
void request_start(bool continuous, bool silence_detection)
Trigger< std::string > tts_start_trigger_
api::APIConnection * api_client_
Trigger< std::string > tts_end_trigger_
struct sockaddr_storage dest_addr_
speaker::Speaker * speaker_
Trigger< Timer > timer_updated_trigger_
void handle_channel_stall_(size_t available, size_t available2)
std::unique_ptr< audio::RingBufferAudioSource > audio_source2_
Trigger tts_stream_end_trigger_
Trigger stt_vad_start_trigger_
void start_playback_timeout_()
void reset_conversation_id()
std::string conversation_id_
microphone::MicrophoneSource * mic_source_
micro_wake_word::MicroWakeWord * micro_wake_word_
size_t speaker_bytes_received_
bool continue_conversation_
void on_set_configuration(const std::vector< std::string > &active_wake_words)
Trigger< Timer > timer_finished_trigger_
Trigger client_disconnected_trigger_
const LogString * message
@ VOICE_ASSISTANT_REQUEST_USE_WAKE_WORD
@ VOICE_ASSISTANT_REQUEST_USE_VAD
@ VOICE_ASSISTANT_TIMER_UPDATED
@ VOICE_ASSISTANT_TIMER_STARTED
@ VOICE_ASSISTANT_TIMER_FINISHED
@ VOICE_ASSISTANT_TIMER_CANCELLED
@ VOICE_ASSISTANT_INTENT_END
@ VOICE_ASSISTANT_RUN_START
@ VOICE_ASSISTANT_TTS_END
@ VOICE_ASSISTANT_RUN_END
@ VOICE_ASSISTANT_WAKE_WORD_START
@ VOICE_ASSISTANT_TTS_STREAM_END
@ VOICE_ASSISTANT_STT_END
@ VOICE_ASSISTANT_STT_VAD_START
@ VOICE_ASSISTANT_INTENT_PROGRESS
@ VOICE_ASSISTANT_TTS_START
@ VOICE_ASSISTANT_STT_START
@ VOICE_ASSISTANT_INTENT_START
@ VOICE_ASSISTANT_TTS_STREAM_START
@ VOICE_ASSISTANT_STT_VAD_END
@ VOICE_ASSISTANT_WAKE_WORD_END
constexpr float AFTER_CONNECTION
For components that should be initialized after a data connection (API/MQTT) is connected.
std::unique_ptr< Socket > socket(int domain, int type, int protocol)
Create a socket of the given domain, type and protocol.
socklen_t set_sockaddr_any(struct sockaddr *addr, socklen_t addrlen, uint16_t port)
Set a sockaddr to the any address and specified port for the IP version used by socket_ip().
VoiceAssistant * global_voice_assistant
Application App
Global storage of Application pointer - only one Application can exist.
std::vector< WakeWord > available_wake_words
std::vector< std::string > active_wake_words
uint32_t max_active_wake_words
static constexpr size_t TO_STR_BUFFER_SIZE
Buffer size for to_str() - sufficient for typical timer names.
std::vector< std::string > trained_languages