ESPHome 2026.6.0-dev
Loading...
Searching...
No Matches
audio_resampler.cpp
Go to the documentation of this file.
1#include "audio_resampler.h"
2
3#ifdef USE_ESP32
4
5#include "esphome/core/hal.h"
6
7#include <cstring>
8
9namespace esphome::audio {
10
// Time limit, in milliseconds, applied to each blocking step of resample(): the transfer to the sink, the fill of
// the audio source, and the idle delay taken while output is paused.
11static const uint32_t READ_WRITE_TIMEOUT_MS = 20;
12
// Records the requested input and output buffer sizes (byte counts; start() converts them to sample counts).
// NOTE(review): numbered line 15, the constructor body, is not present in this listing. The class summary states
// that the constructor allocates the output transfer buffer -- confirm against the full file.
13AudioResampler::AudioResampler(size_t input_buffer_size, size_t output_buffer_size)
14 : input_buffer_size_(input_buffer_size), output_buffer_size_(output_buffer_size) {
16}
17
18esp_err_t AudioResampler::add_source(std::weak_ptr<ring_buffer::RingBuffer> &input_ring_buffer) {
19 // The zero-copy RingBufferAudioSource is created lazily on the first resample() call, once both the ring
20 // buffer (stored here) and the input stream info (set by start()) are available, in either order.
21 this->source_ring_buffer_ = input_ring_buffer.lock();
22 if (this->source_ring_buffer_ == nullptr) {
23 return ESP_ERR_INVALID_STATE;
24 }
25 return ESP_OK;
26}
27
28esp_err_t AudioResampler::add_sink(std::weak_ptr<ring_buffer::RingBuffer> &output_ring_buffer) {
29 if (this->output_transfer_buffer_ != nullptr) {
30 this->output_transfer_buffer_->set_sink(output_ring_buffer);
31 return ESP_OK;
32 }
33 return ESP_ERR_NO_MEM;
34}
35
36#ifdef USE_SPEAKER
// Speaker variant of add_sink(), compiled only when USE_SPEAKER is defined: hands `speaker` to the output transfer
// buffer as its sink and returns ESP_OK, or returns ESP_ERR_NO_MEM when that buffer does not exist.
// NOTE(review): numbered line 37, the function signature, is not present in this listing -- confirm the exact
// declaration (including the type of `speaker`) against the full file.
38 if (this->output_transfer_buffer_ != nullptr) {
39 this->output_transfer_buffer_->set_sink(speaker);
40 return ESP_OK;
41 }
42 return ESP_ERR_NO_MEM;
43}
44#endif
45
46esp_err_t AudioResampler::start(AudioStreamInfo &input_stream_info, AudioStreamInfo &output_stream_info,
47 uint16_t number_of_taps, uint16_t number_of_filters) {
48 this->input_stream_info_ = input_stream_info;
49 this->output_stream_info_ = output_stream_info;
50
51 if (this->output_transfer_buffer_ == nullptr) {
52 return ESP_ERR_NO_MEM;
53 }
54
55 if ((input_stream_info.get_bits_per_sample() > 32) || (output_stream_info.get_bits_per_sample() > 32) ||
56 (input_stream_info_.get_channels() != output_stream_info.get_channels())) {
57 return ESP_ERR_NOT_SUPPORTED;
58 }
59
60 // Reject frame sizes that can't be used as the zero-copy source's alignment up front, where the caller checks
61 // the return code. The lazy create() in resample() keeps its own guard since it runs before the uint8_t cast.
62 const size_t bytes_per_frame = this->input_stream_info_.frames_to_bytes(1);
63 if ((bytes_per_frame == 0) || (bytes_per_frame > RingBufferAudioSource::MAX_ALIGNMENT_BYTES)) {
64 return ESP_ERR_NOT_SUPPORTED;
65 }
66
67 if ((input_stream_info.get_sample_rate() != output_stream_info.get_sample_rate()) ||
68 (input_stream_info.get_bits_per_sample() != output_stream_info.get_bits_per_sample())) {
69 this->resampler_ = make_unique<esp_audio_libs::resampler::Resampler>(
70 input_stream_info.bytes_to_samples(this->input_buffer_size_),
71 output_stream_info.bytes_to_samples(this->output_buffer_size_));
72
73 // Use cascaded biquad filters when downsampling to avoid aliasing
74 bool use_pre_filter = output_stream_info.get_sample_rate() < input_stream_info.get_sample_rate();
75
76 esp_audio_libs::resampler::ResamplerConfiguration resample_config = {
77 .source_sample_rate = static_cast<float>(input_stream_info.get_sample_rate()),
78 .target_sample_rate = static_cast<float>(output_stream_info.get_sample_rate()),
79 .source_bits_per_sample = input_stream_info.get_bits_per_sample(),
80 .target_bits_per_sample = output_stream_info.get_bits_per_sample(),
81 .channels = input_stream_info_.get_channels(),
82 .use_pre_or_post_filter = use_pre_filter,
83 .subsample_interpolate = false, // Doubles the CPU load. Using more filters is a better alternative
84 .number_of_taps = number_of_taps,
85 .number_of_filters = number_of_filters,
86 };
87
88 if (!this->resampler_->initialize(resample_config)) {
89 // Failed to allocate the resampler's internal buffers
90 return ESP_ERR_NO_MEM;
91 }
92 }
93
94 return ESP_OK;
95}
96
// Runs one pass of the resampling pipeline:
//   1. on first use, builds the zero-copy audio source from the stored ring buffer;
//   2. when stop_gracefully is set, checks whether both the source and the output transfer buffer are drained;
//   3. pushes pending output to the sink (or just waits when output is paused);
//   4. exposes the next chunk of input and either resamples it or copies it straight into the output transfer
//      buffer, depending on whether the sample rate or bit depth differs between input and output.
//
// ms_differential is written on every pass that processes data: 0 on the direct-copy path, otherwise
// used_ms - generated_ms. It is dereferenced without a null check, so it must point to valid storage.
//
// NOTE(review): numbered lines 105, 109, 112, 119, 138, 165, 167 and 185 are not present in this listing. They
// include every return statement, the head of the create() call and the right-hand sides of used_ms and
// generated_ms, so the returned AudioResamplerState values are not documented here -- confirm against the full file.
// NOTE(review): output_transfer_buffer_ is dereferenced without a null check, and resampler_ is used whenever the
// formats differ; this presumably relies on a prior successful start() -- confirm with callers.
97AudioResamplerState AudioResampler::resample(bool stop_gracefully, int32_t *ms_differential) {
98 if (this->audio_source_ == nullptr) {
99 // Lazily create the zero-copy source on first use. Frame-aligned reads ensure multi-channel frames are
100 // never split across the ring buffer's wrap boundary.
101 const size_t bytes_per_frame = this->input_stream_info_.frames_to_bytes(1);
102 if ((bytes_per_frame == 0) || (bytes_per_frame > RingBufferAudioSource::MAX_ALIGNMENT_BYTES)) {
103 // Stream info is unset or the frame is too large to use as an alignment; the uint8_t cast below would
104 // truncate it and could yield a source that tears frames.
106 }
107 // Pass the shared_ptr by copy so a failed create() leaves source_ring_buffer_ intact; release our
108 // reference only after the source has taken ownership.
110 static_cast<uint8_t>(bytes_per_frame));
111 if (this->audio_source_ == nullptr) {
113 }
114 this->source_ring_buffer_.reset();
115 }
116
117 if (stop_gracefully) {
// Drained means: nothing left buffered in the source and nothing left to send from the output transfer buffer.
118 if (!this->audio_source_->has_buffered_data() && (this->output_transfer_buffer_->available() == 0)) {
120 }
121 }
122
123 if (!this->pause_output_) {
124 // Move audio data to the sink without shifting the data in the output transfer buffer to avoid unnecessary, slow
125 // data moves
126 this->output_transfer_buffer_->transfer_data_to_sink(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS), false);
127 } else {
128 // If paused, block to avoid wasting CPU resources
129 delay(READ_WRITE_TIMEOUT_MS);
130 }
131
132 // Expose a chunk of the ring buffer's internal storage. pre_shift is ignored by RingBufferAudioSource
133 // (there is no intermediate transfer buffer to compact).
134 this->audio_source_->fill(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS), false);
135
136 if (this->audio_source_->available() == 0) {
137 // No samples available to process
139 }
140
// Work in whole frames on both sides: free space is measured in output frames, available input in input frames.
141 const size_t bytes_free = this->output_transfer_buffer_->free();
142 const uint32_t frames_free = this->output_stream_info_.bytes_to_frames(bytes_free);
143
144 const size_t bytes_available = this->audio_source_->available();
145 const uint32_t frames_available = this->input_stream_info_.bytes_to_frames(bytes_available);
146
// Same condition start() uses to decide whether a Resampler is created.
147 if ((this->input_stream_info_.get_sample_rate() != this->output_stream_info_.get_sample_rate()) ||
148 (this->input_stream_info_.get_bits_per_sample() != this->output_stream_info_.get_bits_per_sample())) {
149 // Adjust gain by -3 dB to avoid clipping due to the resampling process
150 esp_audio_libs::resampler::ResamplerResults results =
151 this->resampler_->resample(this->audio_source_->data(), this->output_transfer_buffer_->get_buffer_end(),
152 frames_available, frames_free, -3);
153
// Advance both buffers by what the resampler reports it actually used and generated.
154 this->audio_source_->consume(this->input_stream_info_.frames_to_bytes(results.frames_used));
155 this->output_transfer_buffer_->increase_buffer_length(
156 this->output_stream_info_.frames_to_bytes(results.frames_generated));
157
158 // Resampling causes slight differences in the durations used versus generated. Computes the difference in
159 // milliseconds. The callback function passing the played audio duration uses the difference to convert from output
160 // duration to input duration.
161 this->accumulated_frames_used_ += results.frames_used;
162 this->accumulated_frames_generated_ += results.frames_generated;
163
164 const int32_t used_ms =
166 const int32_t generated_ms =
168
169 *ms_differential = used_ms - generated_ms;
170
171 } else {
172 // No resampling required, copy samples directly to the output transfer buffer
173 *ms_differential = 0;
174
// Copy the smaller of the free output space and the available input, each rounded down to whole frames.
175 const size_t bytes_to_transfer = std::min(this->output_stream_info_.frames_to_bytes(frames_free),
176 this->input_stream_info_.frames_to_bytes(frames_available));
177
178 std::memcpy((void *) this->output_transfer_buffer_->get_buffer_end(), (const void *) this->audio_source_->data(),
179 bytes_to_transfer);
180
181 this->audio_source_->consume(bytes_to_transfer);
182 this->output_transfer_buffer_->increase_buffer_length(bytes_to_transfer);
183 }
184
186}
187
188} // namespace esphome::audio
189
190#endif
AudioResamplerState resample(bool stop_gracefully, int32_t *ms_differential)
Resamples audio from the ring buffer source and writes to the sink.
std::shared_ptr< ring_buffer::RingBuffer > source_ring_buffer_
std::unique_ptr< esp_audio_libs::resampler::Resampler > resampler_
esp_err_t add_source(std::weak_ptr< ring_buffer::RingBuffer > &input_ring_buffer)
Sets the ring buffer the audio is read from and takes shared ownership of it.
std::unique_ptr< AudioSinkTransferBuffer > output_transfer_buffer_
esp_err_t add_sink(std::weak_ptr< ring_buffer::RingBuffer > &output_ring_buffer)
Adds a sink ring buffer for resampled audio.
AudioResampler(size_t input_buffer_size, size_t output_buffer_size)
Allocates the output transfer buffer.
esp_err_t start(AudioStreamInfo &input_stream_info, AudioStreamInfo &output_stream_info, uint16_t number_of_taps, uint16_t number_of_filters)
Sets up the class to resample.
std::unique_ptr< RingBufferAudioSource > audio_source_
static std::unique_ptr< AudioSinkTransferBuffer > create(size_t buffer_size)
Creates a new sink transfer buffer.
size_t frames_to_bytes(uint32_t frames) const
Converts frames to bytes.
Definition audio.h:53
uint8_t get_bits_per_sample() const
Definition audio.h:28
uint32_t bytes_to_frames(size_t bytes) const
Convert bytes to frames.
Definition audio.h:43
uint8_t get_channels() const
Definition audio.h:29
uint32_t frames_to_milliseconds_with_remainder(uint32_t *frames) const
Computes the duration, in milliseconds, the given amount of frames represents.
Definition audio.cpp:29
uint32_t get_sample_rate() const
Definition audio.h:30
uint32_t bytes_to_samples(size_t bytes) const
Convert bytes to samples.
Definition audio.h:48
static constexpr size_t MAX_ALIGNMENT_BYTES
Maximum supported alignment. Sized to cover 32-bit samples across up to 2 channels (8 bytes).
static std::unique_ptr< RingBufferAudioSource > create(std::shared_ptr< ring_buffer::RingBuffer > ring_buffer, size_t max_fill_bytes, uint8_t alignment_bytes=1)
Creates a new ring-buffer-backed audio source after validating its parameters.
void HOT delay(uint32_t ms)
Definition hal.cpp:85
static void uint32_t