ESPHome 2026.3.0-dev
Loading...
Searching...
No Matches
voice_assistant.h
Go to the documentation of this file.
1#pragma once
2
4
5#ifdef USE_VOICE_ASSISTANT
6
11
15#ifdef USE_MEDIA_PLAYER
17#endif
18#ifdef USE_MICRO_WAKE_WORD
20#endif
21#ifdef USE_SPEAKER
23#endif
25
26#include <span>
27#include <vector>
28
29namespace esphome {
30namespace voice_assistant {
31
// Legacy protocol version numbers reported by VoiceAssistant::get_legacy_version():
//   1 - initial version
//   2 - adds raw speaker support
static constexpr uint32_t LEGACY_INITIAL_VERSION = 1;
static constexpr uint32_t LEGACY_SPEAKER_SUPPORT = 2;
36
45
61
66
67struct Timer {
68 std::string id;
69 std::string name;
70 uint32_t total_seconds;
71 uint32_t seconds_left;
73
75 static constexpr size_t TO_STR_BUFFER_SIZE = 128;
77 const char *to_str(std::span<char, TO_STR_BUFFER_SIZE> buffer) const {
78 snprintf(buffer.data(), buffer.size(),
79 "Timer(id=%s, name=%s, total_seconds=%" PRIu32 ", seconds_left=%" PRIu32 ", is_active=%s)",
80 this->id.c_str(), this->name.c_str(), this->total_seconds, this->seconds_left, YESNO(this->is_active));
81 return buffer.data();
82 }
83 // Remove before 2026.8.0
84 ESPDEPRECATED("Use to_str() instead. Removed in 2026.8.0", "2026.2.0")
85 std::string to_string() const { // NOLINT
86 char buffer[TO_STR_BUFFER_SIZE];
87 return this->to_str(buffer);
88 }
89};
90
/// Plain aggregate describing a single wake word.
struct WakeWord {
  /// Identifier string (NOTE(review): presumably the wake word model id — confirm against callers).
  std::string id;
  /// Wake word text (NOTE(review): presumably the human-readable phrase — confirm).
  std::string wake_word;
  /// List of language strings associated with this wake word
  /// (NOTE(review): format of the entries, e.g. language codes, is not visible here — confirm).
  std::vector<std::string> trained_languages;
};
96
98 std::vector<WakeWord> available_wake_words;
99 std::vector<std::string> active_wake_words;
101};
102
103#ifdef USE_MEDIA_PLAYER
105 IDLE,
106 URL_SENT,
107 PLAYING,
108 FINISHED,
109};
110#endif
111
112class VoiceAssistant : public Component {
113 public:
115
116 void loop() override;
117 void setup() override;
118 float get_setup_priority() const override;
119 void start_streaming();
120 void start_streaming(struct sockaddr_storage *addr, uint16_t port);
121 void failed_to_start();
122
123 void set_microphone_source(microphone::MicrophoneSource *mic_source) { this->mic_source_ = mic_source; }
124#ifdef USE_MICRO_WAKE_WORD
126#endif
127#ifdef USE_SPEAKER
129 this->speaker_ = speaker;
130 this->local_output_ = true;
131 }
132#endif
133#ifdef USE_MEDIA_PLAYER
135 this->media_player_ = media_player;
136 this->local_output_ = true;
137 }
138#endif
139
140 uint32_t get_legacy_version() const {
141#ifdef USE_SPEAKER
142 if (this->speaker_ != nullptr) {
143 return LEGACY_SPEAKER_SUPPORT;
144 }
145#endif
146 return LEGACY_INITIAL_VERSION;
147 }
148
149 uint32_t get_feature_flags() const {
150 uint32_t flags = 0;
153#ifdef USE_SPEAKER
154 if (this->speaker_ != nullptr) {
156 }
157#endif
158
159 if (this->has_timers_) {
161 }
162
163#ifdef USE_MEDIA_PLAYER
164 if (this->media_player_ != nullptr) {
167 }
168#endif
169
170 return flags;
171 }
172
173 void request_start(bool continuous, bool silence_detection);
174 void request_stop();
175
177 void on_audio(const api::VoiceAssistantAudio &msg);
180 void on_set_configuration(const std::vector<std::string> &active_wake_words);
182
183 bool is_running() const { return this->state_ != State::IDLE; }
184 void set_continuous(bool continuous) { this->continuous_ = continuous; }
185 bool is_continuous() const { return this->continuous_; }
186
187 void set_use_wake_word(bool use_wake_word) { this->use_wake_word_ = use_wake_word; }
188
189 void set_noise_suppression_level(uint8_t noise_suppression_level) {
190 this->noise_suppression_level_ = noise_suppression_level;
191 }
192 void set_auto_gain(uint8_t auto_gain) { this->auto_gain_ = auto_gain; }
193 void set_volume_multiplier(float volume_multiplier) { this->volume_multiplier_ = volume_multiplier; }
194 void set_conversation_timeout(uint32_t conversation_timeout) { this->conversation_timeout_ = conversation_timeout; }
196
205#ifdef USE_SPEAKER
208#endif
215
218
219 void client_subscription(api::APIConnection *client, bool subscribe);
221
222 void set_wake_word(const std::string &wake_word) { this->wake_word_ = wake_word; }
223
229 void set_has_timers(bool has_timers) { this->has_timers_ = has_timers; }
230 const std::vector<Timer> &get_timers() const { return this->timers_; }
231
232 protected:
233 bool allocate_buffers_();
234 void clear_buffers_();
235 void deallocate_buffers_();
236
237 void set_state_(State state);
238 void set_state_(State state, State desired_state);
239 void signal_stop_();
241
242 std::unique_ptr<socket::Socket> socket_ = nullptr;
244
252#ifdef USE_SPEAKER
255#endif
263
266
268
269 std::vector<Timer> timers_;
270 void timer_tick_();
276 bool has_timers_{false};
278
280#ifdef USE_SPEAKER
281 void write_speaker_();
283 uint8_t *speaker_buffer_{nullptr};
288 bool stream_ended_{false};
289#endif
290#ifdef USE_MEDIA_PLAYER
292 std::string tts_response_url_{""};
294
296#endif
297
298 bool local_output_{false};
299
300 std::string conversation_id_{""};
301
302 std::string wake_word_{""};
303
304 std::shared_ptr<RingBuffer> ring_buffer_;
305
308 uint8_t auto_gain_;
311
312 uint8_t *send_buffer_{nullptr};
313
314 bool continuous_{false};
316
318
321
324 bool start_udp_socket_();
325
327
328#ifdef USE_MICRO_WAKE_WORD
330#endif
331};
332
333template<typename... Ts> class StartAction : public Action<Ts...>, public Parented<VoiceAssistant> {
334 TEMPLATABLE_VALUE(std::string, wake_word);
335
336 public:
337 void play(const Ts &...x) override {
338 this->parent_->set_wake_word(this->wake_word_.value(x...));
339 this->parent_->request_start(false, this->silence_detection_);
340 }
341
342 void set_silence_detection(bool silence_detection) { this->silence_detection_ = silence_detection; }
343
344 protected:
346};
347
348template<typename... Ts> class StartContinuousAction : public Action<Ts...>, public Parented<VoiceAssistant> {
349 public:
350 void play(const Ts &...x) override { this->parent_->request_start(true, true); }
351};
352
353template<typename... Ts> class StopAction : public Action<Ts...>, public Parented<VoiceAssistant> {
354 public:
355 void play(const Ts &...x) override { this->parent_->request_stop(); }
356};
357
358template<typename... Ts> class IsRunningCondition : public Condition<Ts...>, public Parented<VoiceAssistant> {
359 public:
360 bool check(const Ts &...x) override { return this->parent_->is_running() || this->parent_->is_continuous(); }
361};
362
363template<typename... Ts> class ConnectedCondition : public Condition<Ts...>, public Parented<VoiceAssistant> {
364 public:
365 bool check(const Ts &...x) override { return this->parent_->get_api_connection() != nullptr; }
366};
367
368extern VoiceAssistant *global_voice_assistant; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
369
370} // namespace voice_assistant
371} // namespace esphome
372
373#endif // USE_VOICE_ASSISTANT
Base class for all automation conditions.
Definition automation.h:304
Helper class to easily give an object a parent of type T.
Definition helpers.h:1618
void play(const Ts &...x) override
void set_silence_detection(bool silence_detection)
void play(const Ts &...x) override
std::unique_ptr< socket::Socket > socket_
void set_conversation_timeout(uint32_t conversation_timeout)
Trigger< std::string > * get_stt_end_trigger()
Trigger< std::string > * get_intent_progress_trigger()
void on_timer_event(const api::VoiceAssistantTimerEventResponse &msg)
void on_audio(const api::VoiceAssistantAudio &msg)
Trigger< std::string > * get_tts_end_trigger()
const std::vector< Timer > & get_timers() const
media_player::MediaPlayer * media_player_
void set_media_player(media_player::MediaPlayer *media_player)
void client_subscription(api::APIConnection *client, bool subscribe)
MediaPlayerResponseState media_player_response_state_
std::shared_ptr< RingBuffer > ring_buffer_
void on_event(const api::VoiceAssistantEventResponse &msg)
Trigger< std::string, std::string > error_trigger_
Trigger< const std::vector< Timer > & > timer_tick_trigger_
Trigger< std::string > intent_progress_trigger_
void on_announce(const api::VoiceAssistantAnnounceRequest &msg)
void request_start(bool continuous, bool silence_detection)
void set_speaker(speaker::Speaker *speaker)
Trigger< std::string > * get_tts_start_trigger()
api::APIConnection * get_api_connection() const
void set_microphone_source(microphone::MicrophoneSource *mic_source)
void set_wake_word(const std::string &wake_word)
void set_micro_wake_word(micro_wake_word::MicroWakeWord *mww)
void set_volume_multiplier(float volume_multiplier)
microphone::MicrophoneSource * mic_source_
micro_wake_word::MicroWakeWord * micro_wake_word_
Trigger< std::string, std::string > * get_error_trigger()
void set_noise_suppression_level(uint8_t noise_suppression_level)
void on_set_configuration(const std::vector< std::string > &active_wake_words)
Trigger< const std::vector< Timer > & > * get_timer_tick_trigger()
uint16_t flags
bool state
Definition fan.h:2
VoiceAssistant * global_voice_assistant
Providing packet encoding functions for exchanging data with a remote host.
Definition a01nyub.cpp:7
struct ESPDEPRECATED("Use std::index_sequence instead. Removed in 2026.6.0", "2025.12.0") seq
Definition automation.h:26
std::vector< WakeWord > available_wake_words
std::vector< std::string > active_wake_words
static constexpr size_t TO_STR_BUFFER_SIZE
Buffer size for to_str() - sufficient for typical timer names.
const char * to_str(std::span< char, TO_STR_BUFFER_SIZE > buffer) const
Format to buffer, returns pointer to buffer (may truncate long names)
std::vector< std::string > trained_languages
uint16_t x
Definition tt21100.cpp:5