ESPHome 2026.6.0-dev
Loading...
Searching...
No Matches
voice_assistant.h
Go to the documentation of this file.
1#pragma once
2
4
5#ifdef USE_VOICE_ASSISTANT
6
10
16#ifdef USE_MEDIA_PLAYER
18#endif
19#ifdef USE_MICRO_WAKE_WORD
21#endif
22#ifdef USE_SPEAKER
24#endif
26
27#include <span>
28#include <vector>
29
31
// Legacy protocol version numbers.
//   1 - initial version
//   2 - adds raw speaker support
static constexpr uint32_t LEGACY_INITIAL_VERSION = 1;
static constexpr uint32_t LEGACY_SPEAKER_SUPPORT = 2;
36
46
62
67
// Timer record with helpers that render it as a human-readable string.
// NOTE(review): this listing skips source lines 71-73. to_str() below reads total_seconds, seconds_left
// and is_active, so those members are presumably declared on the skipped lines - confirm.
68struct Timer {
69 std::string id;
70 std::string name;
74
// Buffer size for to_str() - sufficient for typical timer names.
76 static constexpr size_t TO_STR_BUFFER_SIZE = 128;
// Format to buffer, returns pointer to buffer (may truncate long names).
// The span has a fixed extent, so callers must pass exactly TO_STR_BUFFER_SIZE chars.
78 const char *to_str(std::span<char, TO_STR_BUFFER_SIZE> buffer) const {
// snprintf is bounded by buffer.size(), so output longer than the buffer is cut off rather than overflowing.
79 snprintf(buffer.data(), buffer.size(),
80 "Timer(id=%s, name=%s, total_seconds=%" PRIu32 ", seconds_left=%" PRIu32 ", is_active=%s)",
81 this->id.c_str(), this->name.c_str(), this->total_seconds, this->seconds_left, YESNO(this->is_active));
82 return buffer.data();
83 }
84 // Remove before 2026.8.0
85 ESPDEPRECATED("Use to_str() instead. Removed in 2026.8.0", "2026.2.0")
// Deprecated wrapper: formats into a local stack buffer via to_str() and returns the result as a std::string.
86 std::string to_string() const { // NOLINT
87 char buffer[TO_STR_BUFFER_SIZE];
88 return this->to_str(buffer);
89 }
90};
91
/// Plain aggregate describing one wake word entry.
struct WakeWord {
  /// Identifier string of this entry.
  std::string id;
  /// Wake word text. NOTE(review): presumably the phrase itself - confirm against the producer.
  std::string wake_word;
  /// Language strings associated with this entry. NOTE(review): exact format (e.g. language codes) is
  /// not visible here - confirm.
  std::vector<std::string> trained_languages;
};
97
99 std::vector<WakeWord> available_wake_words;
100 std::vector<std::string> active_wake_words;
102};
103
104#ifdef USE_MEDIA_PLAYER
106 IDLE,
107 URL_SENT,
108 PLAYING,
109 FINISHED,
110};
111#endif
112
// Component declaring the voice assistant's setup()/loop() hooks, streaming control, configuration
// setters, timer storage and audio buffering members.
// NOTE(review): this listing skips many source lines (see the gaps in the leading line numbers), so
// several signatures and member declarations used below are not visible here.
113class VoiceAssistant : public Component {
114 public:
116
117 void loop() override;
118 void setup() override;
119 float get_setup_priority() const override;
120 void start_streaming();
121 void start_streaming(struct sockaddr_storage *addr, uint16_t port);
122 void failed_to_start();
123
// Setters for the first and second microphone source; each is a plain pointer assignment.
124 void set_microphone_source(microphone::MicrophoneSource *mic_source) { this->mic_source_ = mic_source; }
125 void set_microphone_source2(microphone::MicrophoneSource *mic_source2) { this->mic_source2_ = mic_source2; }
126#ifdef USE_MICRO_WAKE_WORD
// NOTE(review): source line 127 is not shown; the member index lists
// set_micro_wake_word(micro_wake_word::MicroWakeWord *mww), presumably declared here - confirm.
128#endif
129#ifdef USE_SPEAKER
// NOTE(review): the signature on source line 130 is not shown; the member index lists
// set_speaker(speaker::Speaker *speaker), which presumably owns this body - confirm.
// The visible body stores the speaker pointer and sets local_output_ to true.
131 this->speaker_ = speaker;
132 this->local_output_ = true;
133 }
134#endif
135#ifdef USE_MEDIA_PLAYER
// NOTE(review): the signature on source line 136 is not shown; the member index lists
// set_media_player(media_player::MediaPlayer *media_player), which presumably owns this body - confirm.
// The visible body stores the media player pointer and sets local_output_ to true.
137 this->media_player_ = media_player;
138 this->local_output_ = true;
139 }
140#endif
141
// NOTE(review): the signature on source line 142 is not shown. The visible body returns
// LEGACY_SPEAKER_SUPPORT when speaker_ is non-null (USE_SPEAKER builds only) and
// LEGACY_INITIAL_VERSION otherwise.
143#ifdef USE_SPEAKER
144 if (this->speaker_ != nullptr) {
145 return LEGACY_SPEAKER_SUPPORT;
146 }
147#endif
148 return LEGACY_INITIAL_VERSION;
149 }
150
// NOTE(review): the signature (source line 151) and the statements inside each branch are not shown.
// The visible code starts from flags = 0, tests mic_source2_, speaker_, has_timers_ and media_player_,
// and returns flags; presumably each branch sets a feature bit - confirm.
152 uint32_t flags = 0;
155 if (this->mic_source2_ != nullptr) {
157 }
158#ifdef USE_SPEAKER
159 if (this->speaker_ != nullptr) {
161 }
162#endif
163
164 if (this->has_timers_) {
166 }
167
168#ifdef USE_MEDIA_PLAYER
169 if (this->media_player_ != nullptr) {
172 }
173#endif
174
175 return flags;
176 }
177
178 void request_start(bool continuous, bool silence_detection);
179 void request_stop();
180
182 void on_audio(const api::VoiceAssistantAudio &msg);
185 void on_set_configuration(const std::vector<std::string> &active_wake_words);
187
// "Running" means state_ holds any value other than State::IDLE.
188 bool is_running() const { return this->state_ != State::IDLE; }
189 void set_continuous(bool continuous) { this->continuous_ = continuous; }
190 bool is_continuous() const { return this->continuous_; }
191
192 void set_use_wake_word(bool use_wake_word) { this->use_wake_word_ = use_wake_word; }
193
// The setters below copy their argument into the matching member without validation.
194 void set_noise_suppression_level(uint8_t noise_suppression_level) {
195 this->noise_suppression_level_ = noise_suppression_level;
196 }
197 void set_auto_gain(uint8_t auto_gain) { this->auto_gain_ = auto_gain; }
198 void set_volume_multiplier(float volume_multiplier) { this->volume_multiplier_ = volume_multiplier; }
199 void set_conversation_timeout(uint32_t conversation_timeout) { this->conversation_timeout_ = conversation_timeout; }
201
210#ifdef USE_SPEAKER
213#endif
220
223
224 void client_subscription(api::APIConnection *client, bool subscribe);
226
227 void set_wake_word(const std::string &wake_word) { this->wake_word_ = wake_word; }
228
234 void set_has_timers(bool has_timers) { this->has_timers_ = has_timers; }
// Returns a const reference to the stored timers; no copy is made.
235 const std::vector<Timer> &get_timers() const { return this->timers_; }
236
237 protected:
238 bool allocate_buffers_();
239 void clear_buffers_();
240 void deallocate_buffers_();
241
242 void set_state_(State state);
243 void set_state_(State state, State desired_state);
244 void signal_stop_();
246
247 // Drains the exposed microphone audio and sends it to Home Assistant over the API in one loop() pass.
248 void stream_api_audio_();
249 // Handles a pass where at least one configured channel has no audio exposed, timing out a channel that
250 // stalls. See audio_channel_stall_start_.
251 void handle_channel_stall_(size_t available, size_t available2);
252
253 std::unique_ptr<socket::Socket> socket_ = nullptr;
255
263#ifdef USE_SPEAKER
266#endif
274
277
279
280 std::vector<Timer> timers_;
281 void timer_tick_();
287 bool has_timers_{false};
289
292#ifdef USE_SPEAKER
293 void write_speaker_();
295 uint8_t *speaker_buffer_{nullptr};
300 bool stream_ended_{false};
301#endif
302#ifdef USE_MEDIA_PLAYER
304 std::string tts_response_url_;
306
308#endif
309
310 bool local_output_{false};
311
312 std::string conversation_id_;
313
314 std::string wake_word_;
315
316 // Zero-copy sources that read directly from each microphone channel's ring buffer internal storage.
317 // Each source owns its ring buffer; the matching ``ring_buffer_``/``ring_buffer2_`` weak_ptr is used by
318 // the microphone callback (a different thread) to write into it.
319 std::unique_ptr<audio::RingBufferAudioSource> audio_source_;
320 std::unique_ptr<audio::RingBufferAudioSource> audio_source2_;
321 std::weak_ptr<ring_buffer::RingBuffer> ring_buffer_;
322 std::weak_ptr<ring_buffer::RingBuffer> ring_buffer2_;
323
324 // When streaming multiple channels, the send loop holds an exposed chunk on one channel until the other
325 // channel also has audio so the channels are always sent together (an empty payload looks like
326 // end-of-stream to Home Assistant). Home Assistant has no stream timeout, so a channel that stops
327 // producing entirely would hang streaming forever. This records when such an imbalance began so a
328 // prolonged one can be detected and stopped; 0 means no imbalance is currently being timed.
// NOTE(review): the member this comment describes (source line 329) is not shown in this listing.
330
// NOTE(review): no in-class initializer is visible for auto_gain_ (unlike continuous_ or has_timers_);
// presumably it is always assigned through set_auto_gain() before being read - confirm.
333 uint8_t auto_gain_;
336
337 bool continuous_{false};
339
341
344
347 bool start_udp_socket_();
348
350
351#ifdef USE_MICRO_WAKE_WORD
353#endif
354};
355
// Automation action that stores a wake word on the parent VoiceAssistant and then requests a
// non-continuous start.
// NOTE(review): play() reads this->silence_detection_, but its declaration is not visible; the listing
// skips source line 368, which presumably declares it under `protected:` - confirm.
356template<typename... Ts> class StartAction : public Action<Ts...>, public Parented<VoiceAssistant> {
// Declares the templatable wake_word value; play() reads it back through this->wake_word_.
357 TEMPLATABLE_VALUE(std::string, wake_word);
358
359 public:
// Evaluates wake_word_ with the automation arguments and passes the result to set_wake_word(), then
// calls request_start() with continuous = false and the stored silence-detection flag.
360 void play(const Ts &...x) override {
361 this->parent_->set_wake_word(this->wake_word_.value(x...));
362 this->parent_->request_start(false, this->silence_detection_);
363 }
364
// Stores the flag that play() later forwards as the second argument of request_start().
365 void set_silence_detection(bool silence_detection) { this->silence_detection_ = silence_detection; }
366
367 protected:
369};
370
371template<typename... Ts> class StartContinuousAction : public Action<Ts...>, public Parented<VoiceAssistant> {
372 public:
373 void play(const Ts &...x) override { this->parent_->request_start(true, true); }
374};
375
376template<typename... Ts> class StopAction : public Action<Ts...>, public Parented<VoiceAssistant> {
377 public:
378 void play(const Ts &...x) override { this->parent_->request_stop(); }
379};
380
381template<typename... Ts> class IsRunningCondition : public Condition<Ts...>, public Parented<VoiceAssistant> {
382 public:
383 bool check(const Ts &...x) override { return this->parent_->is_running() || this->parent_->is_continuous(); }
384};
385
386template<typename... Ts> class ConnectedCondition : public Condition<Ts...>, public Parented<VoiceAssistant> {
387 public:
388 bool check(const Ts &...x) override { return this->parent_->get_api_connection() != nullptr; }
389};
390
391extern VoiceAssistant *global_voice_assistant; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
392
393} // namespace esphome::voice_assistant
394
395#endif // USE_VOICE_ASSISTANT
Base class for all automation conditions.
Definition automation.h:438
Helper class to easily give an object a parent of type T.
Definition helpers.h:1861
void play(const Ts &...x) override
void set_silence_detection(bool silence_detection)
void play(const Ts &...x) override
std::unique_ptr< socket::Socket > socket_
void set_conversation_timeout(uint32_t conversation_timeout)
microphone::MicrophoneSource * mic_source2_
Trigger< std::string > * get_stt_end_trigger()
Trigger< std::string > * get_intent_progress_trigger()
void on_timer_event(const api::VoiceAssistantTimerEventResponse &msg)
void on_audio(const api::VoiceAssistantAudio &msg)
Trigger< std::string > * get_tts_end_trigger()
std::unique_ptr< audio::RingBufferAudioSource > audio_source_
const std::vector< Timer > & get_timers() const
std::weak_ptr< ring_buffer::RingBuffer > ring_buffer2_
std::weak_ptr< ring_buffer::RingBuffer > ring_buffer_
media_player::MediaPlayer * media_player_
void set_media_player(media_player::MediaPlayer *media_player)
void client_subscription(api::APIConnection *client, bool subscribe)
MediaPlayerResponseState media_player_response_state_
void on_event(const api::VoiceAssistantEventResponse &msg)
Trigger< std::string, std::string > error_trigger_
Trigger< const std::vector< Timer > & > timer_tick_trigger_
Trigger< std::string > intent_progress_trigger_
void set_microphone_source2(microphone::MicrophoneSource *mic_source2)
void on_announce(const api::VoiceAssistantAnnounceRequest &msg)
void request_start(bool continuous, bool silence_detection)
void set_speaker(speaker::Speaker *speaker)
Trigger< std::string > * get_tts_start_trigger()
api::APIConnection * get_api_connection() const
void set_microphone_source(microphone::MicrophoneSource *mic_source)
void handle_channel_stall_(size_t available, size_t available2)
std::unique_ptr< audio::RingBufferAudioSource > audio_source2_
void set_wake_word(const std::string &wake_word)
void set_micro_wake_word(micro_wake_word::MicroWakeWord *mww)
void set_volume_multiplier(float volume_multiplier)
microphone::MicrophoneSource * mic_source_
micro_wake_word::MicroWakeWord * micro_wake_word_
Trigger< std::string, std::string > * get_error_trigger()
void set_noise_suppression_level(uint8_t noise_suppression_level)
void on_set_configuration(const std::vector< std::string > &active_wake_words)
Trigger< const std::vector< Timer > & > * get_timer_tick_trigger()
uint16_t flags
bool state
Definition fan.h:2
VoiceAssistant * global_voice_assistant
ESPDEPRECATED("Use LightState::gamma_correct_lut() instead. Removed in 2026.9.0.", "2026.3.0") float gamma_correct(float value
Applies gamma correction of gamma to value.
static void uint32_t
std::vector< WakeWord > available_wake_words
std::vector< std::string > active_wake_words
static constexpr size_t TO_STR_BUFFER_SIZE
Buffer size for to_str() - sufficient for typical timer names.
const char * to_str(std::span< char, TO_STR_BUFFER_SIZE > buffer) const
Format to buffer, returns pointer to buffer (may truncate long names)
std::vector< std::string > trained_languages
uint16_t x
Definition tt21100.cpp:5