4#ifdef USE_VOICE_ASSISTANT
12namespace voice_assistant {
14static const char *
const TAG =
"voice_assistant";
20static const size_t SAMPLE_RATE_HZ = 16000;
22static const size_t RING_BUFFER_SAMPLES = 512 * SAMPLE_RATE_HZ / 1000;
23static const size_t RING_BUFFER_SIZE = RING_BUFFER_SAMPLES *
sizeof(int16_t);
24static const size_t SEND_BUFFER_SAMPLES = 32 * SAMPLE_RATE_HZ / 1000;
25static const size_t SEND_BUFFER_SIZE = SEND_BUFFER_SAMPLES *
sizeof(int16_t);
26static const size_t RECEIVE_SIZE = 1024;
27static const size_t SPEAKER_BUFFER_SIZE = 16 * RECEIVE_SIZE;
33 std::shared_ptr<RingBuffer> temp_ring_buffer = this->
ring_buffer_;
35 temp_ring_buffer->write((
void *) data.data(), data.size());
39#ifdef USE_MEDIA_PLAYER
66 ESP_LOGE(TAG,
"Could not create socket");
71 int err = this->
socket_->setsockopt(SOL_SOCKET, SO_REUSEADDR, &enable,
sizeof(
int));
73 ESP_LOGW(TAG,
"Socket unable to set reuseaddr: errno %d", err);
76 err = this->
socket_->setblocking(
false);
78 ESP_LOGE(TAG,
"Socket unable to set nonblocking mode: errno %d", err);
89 ESP_LOGE(TAG,
"Socket unable to set sockaddr: errno %d", errno);
96 ESP_LOGE(TAG,
"Socket unable to bind: errno %d", errno);
112 ESP_LOGW(TAG,
"Could not allocate speaker buffer");
121 ESP_LOGE(TAG,
"Could not allocate ring buffer");
130 ESP_LOGW(TAG,
"Could not allocate send buffer");
180 ESP_LOGD(TAG,
"reset conversation ID");
207 ESP_LOGD(TAG,
"Starting Microphone");
228 ESP_LOGD(TAG,
"Requesting start");
248#ifdef USE_MEDIA_PLAYER
256 ESP_LOGW(TAG,
"Could not request start");
272 while (available >= SEND_BUFFER_SIZE) {
276 msg.
set_data(this->send_buffer_, read_bytes);
286 sizeof(this->dest_addr_));
312 bool playing =
false;
319 if (received_len > 0) {
325 ESP_LOGD(TAG,
"Receive buffer full");
335 ESP_LOGD(TAG,
"End of audio stream received");
344#ifdef USE_MEDIA_PLAYER
351 ESP_LOGD(TAG,
"Announcement finished playing");
376 ESP_LOGD(TAG,
"Speaker has finished outputting all audio");
413 ESP_LOGV(TAG,
"Speaker buffer full, trying again next loop");
423 ESP_LOGE(TAG,
"Client attempting to unsubscribe that is not the current API Client");
432 ESP_LOGE(TAG,
"Multiple API Clients attempting to connect to Voice Assistant");
442static const LogString *voice_assistant_state_to_string(
State state) {
445 return LOG_STR(
"IDLE");
447 return LOG_STR(
"START_MICROPHONE");
449 return LOG_STR(
"STARTING_MICROPHONE");
451 return LOG_STR(
"WAIT_FOR_VAD");
453 return LOG_STR(
"WAITING_FOR_VAD");
455 return LOG_STR(
"START_PIPELINE");
457 return LOG_STR(
"STARTING_PIPELINE");
459 return LOG_STR(
"STREAMING_MICROPHONE");
461 return LOG_STR(
"STOP_MICROPHONE");
463 return LOG_STR(
"STOPPING_MICROPHONE");
465 return LOG_STR(
"AWAITING_RESPONSE");
467 return LOG_STR(
"STREAMING_RESPONSE");
469 return LOG_STR(
"RESPONSE_FINISHED");
471 return LOG_STR(
"UNKNOWN");
478 ESP_LOGD(TAG,
"State changed from %s to %s", LOG_STR_ARG(voice_assistant_state_to_string(old_state)),
479 LOG_STR_ARG(voice_assistant_state_to_string(
state)));
485 ESP_LOGD(TAG,
"Desired state set to %s", LOG_STR_ARG(voice_assistant_state_to_string(desired_state)));
489 ESP_LOGE(TAG,
"Failed to start server. See Home Assistant logs for more details.");
490 this->
error_trigger_->
trigger(
"failed-to-start",
"Failed to start server. See Home Assistant logs for more details.");
500 ESP_LOGD(TAG,
"Client started, streaming microphone");
516 ESP_LOGD(TAG,
"Client started, streaming microphone");
542 ESP_LOGE(TAG,
"No API client connected");
582#ifdef USE_MEDIA_PLAYER
606 ESP_LOGD(TAG,
"Signaling stop");
624 ESP_LOGD(TAG,
"Event Type: %" PRId32, msg.
event_type);
627 ESP_LOGD(TAG,
"Assist Pipeline running");
628#ifdef USE_MEDIA_PLAYER
630 for (
auto arg : msg.
data) {
631 if (arg.name ==
"url") {
641 ESP_LOGD(TAG,
"Wake word detected");
646 ESP_LOGD(TAG,
"STT started");
651 for (
auto arg : msg.
data) {
652 if (arg.name ==
"text") {
653 text = std::move(arg.value);
657 ESP_LOGW(TAG,
"No text in STT_END event");
659 }
else if (text.length() > 500) {
660 text = text.substr(0, 497) +
"...";
662 ESP_LOGD(TAG,
"Speech recognised as: \"%s\"", text.c_str());
667 ESP_LOGD(TAG,
"Intent started");
671 ESP_LOGD(TAG,
"Intent progress");
672 std::string tts_url_for_trigger =
"";
673#ifdef USE_MEDIA_PLAYER
675 for (
const auto &arg : msg.
data) {
676 if ((arg.name ==
"tts_start_streaming") && (arg.value ==
"1") && !this->tts_response_url_.empty()) {
695 for (
auto arg : msg.
data) {
696 if (arg.name ==
"conversation_id") {
698 }
else if (arg.name ==
"continue_conversation") {
707 for (
auto arg : msg.
data) {
708 if (arg.name ==
"text") {
709 text = std::move(arg.value);
713 ESP_LOGW(TAG,
"No text in TTS_START event");
716 if (text.length() > 500) {
717 text = text.substr(0, 497) +
"...";
719 ESP_LOGD(TAG,
"Response: \"%s\"", text.c_str());
720 this->
defer([
this, text]() {
732 for (
auto arg : msg.
data) {
733 if (arg.name ==
"url") {
734 url = std::move(arg.value);
738 ESP_LOGW(TAG,
"No url in TTS_END event");
741 ESP_LOGD(TAG,
"Response URL: \"%s\"", url.c_str());
742 this->
defer([
this, url]() {
743#ifdef USE_MEDIA_PLAYER
756 if (new_state != this->
state_) {
764 ESP_LOGD(TAG,
"Assist Pipeline ended");
777 std::string code =
"";
778 std::string message =
"";
779 for (
auto arg : msg.
data) {
780 if (arg.name ==
"code") {
781 code = std::move(arg.value);
782 }
else if (arg.name ==
"message") {
783 message = std::move(arg.value);
786 if (code ==
"wake-word-timeout" || code ==
"wake_word_detection_aborted" || code ==
"no_wake_word") {
789 }
else if (code ==
"wake-provider-missing" || code ==
"wake-engine-missing") {
791 this->
defer([
this, code, message]() {
797 ESP_LOGE(TAG,
"Error: %s - %s", code.c_str(), message.c_str());
809 ESP_LOGD(TAG,
"TTS stream start");
819 ESP_LOGD(TAG,
"TTS stream end");
825 ESP_LOGD(TAG,
"Starting STT by VAD");
829 ESP_LOGD(TAG,
"STT by VAD end");
834 ESP_LOGD(TAG,
"Unhandled event type: %" PRId32, msg.
event_type);
847 ESP_LOGV(TAG,
"Received audio: %u bytes from API", msg.
data.length());
849 ESP_LOGE(TAG,
"Cannot receive audio, buffer is full");
864 ESP_LOGD(TAG,
"Timer Event");
865 ESP_LOGD(TAG,
" Type: %" PRId32, msg.
event_type);
866 ESP_LOGD(TAG,
" %s", timer.
to_string().c_str());
895 std::vector<Timer> res;
896 res.reserve(this->
timers_.size());
897 for (
auto &pair : this->
timers_) {
898 auto &timer = pair.second;
899 if (timer.is_active && timer.seconds_left > 0) {
900 timer.seconds_left--;
902 res.push_back(timer);
908#ifdef USE_MEDIA_PLAYER
940#ifdef USE_MICRO_WAKE_WORD
948 for (
auto ww_id : active_wake_words) {
950 if (model->get_id() == ww_id) {
952 ESP_LOGD(TAG,
"Enabled wake word: %s (id=%s)", model->get_wake_word().c_str(), model->get_id().c_str());
964#ifdef USE_MICRO_WAKE_WORD
969 if (model->is_enabled()) {
974 wake_word.
id = model->get_id();
975 wake_word.
wake_word = model->get_wake_word();
976 for (
const auto &lang : model->get_trained_languages()) {
985#ifdef USE_MICRO_WAKE_WORD
virtual void mark_failed()
Mark this component as failed.
void set_interval(const std::string &name, uint32_t interval, std::function< void()> &&f)
Set an interval function with a unique name.
bool cancel_timeout(const std::string &name)
Cancel a timeout function.
void status_clear_error()
bool status_has_error() const
bool cancel_interval(const std::string &name)
Cancel an interval function.
void defer(const std::string &name, std::function< void()> &&f)
Defer a callback to the next loop() call.
void set_timeout(const std::string &name, uint32_t timeout, std::function< void()> &&f)
Set a timeout function with a unique name.
void status_set_error(const char *message=nullptr)
An STL allocator that uses SPI or internal RAM.
void deallocate(T *p, size_t n)
static std::unique_ptr< RingBuffer > create(size_t len)
StringRef is a reference to a string owned by something else.
void trigger(Ts... x)
Inform the parent automation that the event has triggered.
std::string get_client_combined_info() const
bool send_message(const ProtoMessage &msg, uint8_t message_type)
static constexpr uint8_t MESSAGE_TYPE
std::string preannounce_media_id
void set_data(const uint8_t *data, size_t len)
static constexpr uint8_t MESSAGE_TYPE
uint32_t noise_suppression_level
enums::VoiceAssistantEvent event_type
std::vector< VoiceAssistantEventData > data
void set_wake_word_phrase(const StringRef &ref)
static constexpr uint8_t MESSAGE_TYPE
VoiceAssistantAudioSettings audio_settings
void set_conversation_id(const StringRef &ref)
enums::VoiceAssistantTimerEvent event_type
std::vector< WakeWordModel * > get_wake_words()
void add_data_callback(std::function< void(const std::vector< uint8_t > &)> &&data_callback)
virtual size_t play(const uint8_t *data, size_t length)=0
Plays the provided audio data.
virtual bool has_buffered_data() const =0
std::unique_ptr< socket::Socket > socket_
Trigger< Timer > * timer_started_trigger_
const Configuration & get_configuration()
size_t speaker_buffer_size_
bool started_streaming_tts_
std::string tts_response_url_
std::unordered_map< std::string, Timer > timers_
bool wait_for_stream_end_
size_t speaker_buffer_index_
void on_timer_event(const api::VoiceAssistantTimerEventResponse &msg)
void on_audio(const api::VoiceAssistantAudio &msg)
media_player::MediaPlayer * media_player_
float get_setup_priority() const override
Trigger< std::string > * stt_end_trigger_
void set_state_(State state)
Trigger< Timer > * timer_cancelled_trigger_
Trigger< std::string, std::string > * error_trigger_
uint8_t * speaker_buffer_
void client_subscription(api::APIConnection *client, bool subscribe)
Trigger * stt_vad_end_trigger_
MediaPlayerResponseState media_player_response_state_
void deallocate_buffers_()
Trigger< std::vector< Timer > > * timer_tick_trigger_
std::shared_ptr< RingBuffer > ring_buffer_
uint8_t noise_suppression_level_
void on_event(const api::VoiceAssistantEventResponse &msg)
Trigger * intent_end_trigger_
Trigger * client_disconnected_trigger_
Trigger * wake_word_detected_trigger_
uint32_t conversation_timeout_
Trigger * stt_vad_start_trigger_
Trigger * tts_stream_start_trigger_
Trigger< std::string > * tts_start_trigger_
void on_announce(const api::VoiceAssistantAnnounceRequest &msg)
void request_start(bool continuous, bool silence_detection)
api::APIConnection * api_client_
Trigger< std::string > * tts_end_trigger_
struct sockaddr_storage dest_addr_
speaker::Speaker * speaker_
Trigger * tts_stream_end_trigger_
Trigger< Timer > * timer_finished_trigger_
Trigger * intent_start_trigger_
void start_playback_timeout_()
void reset_conversation_id()
std::string conversation_id_
Trigger * client_connected_trigger_
Trigger< std::string > * intent_progress_trigger_
microphone::MicrophoneSource * mic_source_
micro_wake_word::MicroWakeWord * micro_wake_word_
size_t speaker_bytes_received_
Trigger< Timer > * timer_updated_trigger_
bool continue_conversation_
void on_set_configuration(const std::vector< std::string > &active_wake_words)
Trigger * listening_trigger_
@ VOICE_ASSISTANT_REQUEST_USE_WAKE_WORD
@ VOICE_ASSISTANT_REQUEST_USE_VAD
@ VOICE_ASSISTANT_TIMER_UPDATED
@ VOICE_ASSISTANT_TIMER_STARTED
@ VOICE_ASSISTANT_TIMER_FINISHED
@ VOICE_ASSISTANT_TIMER_CANCELLED
@ VOICE_ASSISTANT_INTENT_END
@ VOICE_ASSISTANT_RUN_START
@ VOICE_ASSISTANT_TTS_END
@ VOICE_ASSISTANT_RUN_END
@ VOICE_ASSISTANT_WAKE_WORD_START
@ VOICE_ASSISTANT_TTS_STREAM_END
@ VOICE_ASSISTANT_STT_END
@ VOICE_ASSISTANT_STT_VAD_START
@ VOICE_ASSISTANT_INTENT_PROGRESS
@ VOICE_ASSISTANT_TTS_START
@ VOICE_ASSISTANT_STT_START
@ VOICE_ASSISTANT_INTENT_START
@ VOICE_ASSISTANT_TTS_STREAM_START
@ VOICE_ASSISTANT_STT_VAD_END
@ VOICE_ASSISTANT_WAKE_WORD_END
const float AFTER_CONNECTION
For components that should be initialized after a data connection (API/MQTT) is connected.
std::unique_ptr< Socket > socket(int domain, int type, int protocol)
Create a socket of the given domain, type and protocol.
socklen_t set_sockaddr_any(struct sockaddr *addr, socklen_t addrlen, uint16_t port)
Set a sockaddr to the any address and specified port for the IP version used by socket_ip().
VoiceAssistant * global_voice_assistant
Providing packet encoding functions for exchanging data with a remote host.
std::vector< WakeWord > available_wake_words
std::vector< std::string > active_wake_words
uint32_t max_active_wake_words
std::string to_string() const
std::vector< std::string > trained_languages