ESPHome: esphome/components/audio/audio_resampler.cpp Source File

#include "audio_resampler.h"


#ifdef USE_ESP32


#include "esphome/core/hal.h"


#include <cstring>


namespace esphome::audio {


// Timeout, in milliseconds, shared by the sink transfer, the source fill, and the idle delay used while output
// is paused in AudioResampler::resample().
static constexpr uint32_t READ_WRITE_TIMEOUT_MS = 20;


AudioResampler::AudioResampler(size_t input_buffer_size, size_t output_buffer_size)

    : input_buffer_size_(input_buffer_size), output_buffer_size_(output_buffer_size) {

  this->output_transfer_buffer_ = AudioSinkTransferBuffer::create(output_buffer_size);

}


// Stores a shared reference to the ring buffer that audio will be read from.
//
// Only the ring buffer is captured here. The zero-copy RingBufferAudioSource that wraps it is built lazily on the
// first resample() call, so this function and start() (which supplies the input stream info) may be called in
// either order.
//
// Returns ESP_OK if the weak pointer could be locked, ESP_ERR_INVALID_STATE if it has expired.
esp_err_t AudioResampler::add_source(std::weak_ptr<ring_buffer::RingBuffer> &input_ring_buffer) {
  this->source_ring_buffer_ = input_ring_buffer.lock();

  const bool ring_buffer_alive = (this->source_ring_buffer_ != nullptr);
  return ring_buffer_alive ? ESP_OK : ESP_ERR_INVALID_STATE;
}


// Routes the output transfer buffer's data to the given ring buffer.
//
// Returns ESP_OK once the sink is attached, or ESP_ERR_NO_MEM if the output transfer buffer does not exist.
esp_err_t AudioResampler::add_sink(std::weak_ptr<ring_buffer::RingBuffer> &output_ring_buffer) {
  // Without an output transfer buffer there is nothing to attach the sink to
  if (this->output_transfer_buffer_ == nullptr) {
    return ESP_ERR_NO_MEM;
  }

  this->output_transfer_buffer_->set_sink(output_ring_buffer);
  return ESP_OK;
}


#ifdef USE_SPEAKER


// Routes the output transfer buffer's data to the given speaker.
//
// Returns ESP_OK once the sink is attached, or ESP_ERR_NO_MEM if the output transfer buffer does not exist.
esp_err_t AudioResampler::add_sink(speaker::Speaker *speaker) {
  // Without an output transfer buffer there is nothing to attach the sink to
  if (this->output_transfer_buffer_ == nullptr) {
    return ESP_ERR_NO_MEM;
  }

  this->output_transfer_buffer_->set_sink(speaker);
  return ESP_OK;
}


#endif


// Configures the resampler for the given input and output stream formats.
//
// The stream infos are stored first so that resample() can use them. A resampler object is only created when the
// sample rate or the bits per sample differ between input and output; otherwise resample() copies samples directly
// and any resampler left over from an earlier start() is released.
//
// Parameters:
//   input_stream_info   format of the audio read from the source
//   output_stream_info  format of the audio written to the sink
//   number_of_taps      forwarded to the resampler configuration
//   number_of_filters   forwarded to the resampler configuration
//
// Returns:
//   ESP_OK                 on success
//   ESP_ERR_NO_MEM         if the output transfer buffer does not exist or the resampler failed to initialize
//   ESP_ERR_NOT_SUPPORTED  if either stream uses more than 32 bits per sample, the channel counts differ, or the
//                          input frame size cannot be used as the zero-copy source's alignment
esp_err_t AudioResampler::start(AudioStreamInfo &input_stream_info, AudioStreamInfo &output_stream_info,
                                uint16_t number_of_taps, uint16_t number_of_filters) {
  this->input_stream_info_ = input_stream_info;
  this->output_stream_info_ = output_stream_info;

  if (this->output_transfer_buffer_ == nullptr) {
    return ESP_ERR_NO_MEM;
  }

  if ((input_stream_info.get_bits_per_sample() > 32) || (output_stream_info.get_bits_per_sample() > 32) ||
      (input_stream_info.get_channels() != output_stream_info.get_channels())) {
    return ESP_ERR_NOT_SUPPORTED;
  }

  // Reject frame sizes that can't be used as the zero-copy source's alignment up front, where the caller checks
  // the return code. The lazy create() in resample() keeps its own guard since it runs before the uint8_t cast.
  const size_t bytes_per_frame = this->input_stream_info_.frames_to_bytes(1);
  if ((bytes_per_frame == 0) || (bytes_per_frame > RingBufferAudioSource::MAX_ALIGNMENT_BYTES)) {
    return ESP_ERR_NOT_SUPPORTED;
  }

  const bool needs_conversion =
      (input_stream_info.get_sample_rate() != output_stream_info.get_sample_rate()) ||
      (input_stream_info.get_bits_per_sample() != output_stream_info.get_bits_per_sample());

  if (!needs_conversion) {
    // resample() copies samples directly in this case, so free a resampler left over from a previous start()
    this->resampler_.reset();
    return ESP_OK;
  }

  this->resampler_ = make_unique<esp_audio_libs::resampler::Resampler>(
      input_stream_info.bytes_to_samples(this->input_buffer_size_),
      output_stream_info.bytes_to_samples(this->output_buffer_size_));

  // Use cascaded biquad filters when downsampling to avoid aliasing
  const bool use_pre_filter = output_stream_info.get_sample_rate() < input_stream_info.get_sample_rate();

  esp_audio_libs::resampler::ResamplerConfiguration resample_config = {
      .source_sample_rate = static_cast<float>(input_stream_info.get_sample_rate()),
      .target_sample_rate = static_cast<float>(output_stream_info.get_sample_rate()),
      .source_bits_per_sample = input_stream_info.get_bits_per_sample(),
      .target_bits_per_sample = output_stream_info.get_bits_per_sample(),
      .channels = input_stream_info.get_channels(),
      .use_pre_or_post_filter = use_pre_filter,
      .subsample_interpolate = false,  // Doubles the CPU load. Using more filters is a better alternative
      .number_of_taps = number_of_taps,
      .number_of_filters = number_of_filters,
  };

  if (!this->resampler_->initialize(resample_config)) {
    // Failed to allocate the resampler's internal buffers. Drop the unusable instance so it is never invoked and
    // its memory is returned.
    this->resampler_.reset();
    return ESP_ERR_NO_MEM;
  }

  return ESP_OK;
}


// Performs one resampling step: pushes pending output to the sink, reads a chunk from the source ring buffer, and
// either resamples it or copies it unchanged into the output transfer buffer.
//
// Parameters:
//   stop_gracefully  when true, returns FINISHED once the source has no buffered data and the output transfer
//                    buffer is empty
//   ms_differential  out-parameter receiving the difference, in milliseconds, between the input duration consumed
//                    and the output duration generated by this call. It is always written (0 when nothing was
//                    resampled); a null pointer is tolerated.
//
// Returns:
//   FAILED      if the output transfer buffer or a required resampler is missing, or the zero-copy source cannot
//               be created
//   FINISHED    see stop_gracefully
//   RESAMPLING  otherwise
AudioResamplerState AudioResampler::resample(bool stop_gracefully, int32_t *ms_differential) {
  // Tolerate a null out-parameter and guarantee a defined value on every return path
  int32_t discarded_differential = 0;
  if (ms_differential == nullptr) {
    ms_differential = &discarded_differential;
  }
  *ms_differential = 0;

  if (this->output_transfer_buffer_ == nullptr) {
    // add_sink() and start() report this as ESP_ERR_NO_MEM; there is nowhere to write output
    return AudioResamplerState::FAILED;
  }

  const bool needs_conversion =
      (this->input_stream_info_.get_sample_rate() != this->output_stream_info_.get_sample_rate()) ||
      (this->input_stream_info_.get_bits_per_sample() != this->output_stream_info_.get_bits_per_sample());

  if (needs_conversion && (this->resampler_ == nullptr)) {
    // The formats differ but no resampler has been set up, so the audio cannot be converted
    return AudioResamplerState::FAILED;
  }

  if (this->audio_source_ == nullptr) {
    // Lazily create the zero-copy source on first use. Frame-aligned reads ensure multi-channel frames are
    // never split across the ring buffer's wrap boundary.
    const size_t bytes_per_frame = this->input_stream_info_.frames_to_bytes(1);
    if ((bytes_per_frame == 0) || (bytes_per_frame > RingBufferAudioSource::MAX_ALIGNMENT_BYTES)) {
      // Stream info is unset or the frame is too large to use as an alignment; the uint8_t cast below would
      // truncate it and could yield a source that tears frames.
      return AudioResamplerState::FAILED;
    }

    // Pass the shared_ptr by copy so a failed create() leaves source_ring_buffer_ intact; release our
    // reference only after the source has taken ownership.
    this->audio_source_ = RingBufferAudioSource::create(this->source_ring_buffer_, this->input_buffer_size_,
                                                        static_cast<uint8_t>(bytes_per_frame));
    if (this->audio_source_ == nullptr) {
      return AudioResamplerState::FAILED;
    }
    this->source_ring_buffer_.reset();
  }

  if (stop_gracefully) {
    if (!this->audio_source_->has_buffered_data() && (this->output_transfer_buffer_->available() == 0)) {
      return AudioResamplerState::FINISHED;
    }
  }

  if (!this->pause_output_) {
    // Move audio data to the sink without shifting the data in the output transfer buffer to avoid unnecessary, slow
    // data moves
    this->output_transfer_buffer_->transfer_data_to_sink(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS), false);
  } else {
    // If paused, block to avoid wasting CPU resources
    delay(READ_WRITE_TIMEOUT_MS);
  }

  // Expose a chunk of the ring buffer's internal storage. pre_shift is ignored by RingBufferAudioSource
  // (there is no intermediate transfer buffer to compact).
  this->audio_source_->fill(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS), false);

  const size_t bytes_available = this->audio_source_->available();
  if (bytes_available == 0) {
    // No samples available to process
    return AudioResamplerState::RESAMPLING;
  }

  const size_t bytes_free = this->output_transfer_buffer_->free();
  const uint32_t frames_free = this->output_stream_info_.bytes_to_frames(bytes_free);
  const uint32_t frames_available = this->input_stream_info_.bytes_to_frames(bytes_available);

  if (needs_conversion) {
    // Adjust gain by -3 dB to avoid clipping due to the resampling process
    esp_audio_libs::resampler::ResamplerResults results =
        this->resampler_->resample(this->audio_source_->data(), this->output_transfer_buffer_->get_buffer_end(),
                                   frames_available, frames_free, -3);

    this->audio_source_->consume(this->input_stream_info_.frames_to_bytes(results.frames_used));
    this->output_transfer_buffer_->increase_buffer_length(
        this->output_stream_info_.frames_to_bytes(results.frames_generated));

    // Resampling causes slight differences in the durations used versus generated. Computes the difference in
    // milliseconds. The callback function passing the played audio duration uses the difference to convert from
    // output duration to input duration.
    this->accumulated_frames_used_ += results.frames_used;
    this->accumulated_frames_generated_ += results.frames_generated;

    const int32_t used_ms =
        this->input_stream_info_.frames_to_milliseconds_with_remainder(&this->accumulated_frames_used_);
    const int32_t generated_ms =
        this->output_stream_info_.frames_to_milliseconds_with_remainder(&this->accumulated_frames_generated_);

    *ms_differential = used_ms - generated_ms;
  } else {
    // No resampling required, copy samples directly to the output transfer buffer. The differential stays at the
    // 0 written on entry.
    const size_t bytes_to_transfer = std::min(this->output_stream_info_.frames_to_bytes(frames_free),
                                              this->input_stream_info_.frames_to_bytes(frames_available));

    std::memcpy(this->output_transfer_buffer_->get_buffer_end(), this->audio_source_->data(), bytes_to_transfer);

    this->audio_source_->consume(bytes_to_transfer);
    this->output_transfer_buffer_->increase_buffer_length(bytes_to_transfer);
  }

  return AudioResamplerState::RESAMPLING;
}


}  // namespace esphome::audio


#endif

audio_resampler.h

esphome::audio::AudioResampler::pause_output_
bool pause_output_
Definition audio_resampler.h:93

esphome::audio::AudioResampler::resample
AudioResamplerState resample(bool stop_gracefully, int32_t *ms_differential)
Resamples audio from the ring buffer source and writes to the sink.
Definition audio_resampler.cpp:97

esphome::audio::AudioResampler::input_buffer_size_
size_t input_buffer_size_
Definition audio_resampler.h:87

esphome::audio::AudioResampler::source_ring_buffer_
std::shared_ptr< ring_buffer::RingBuffer > source_ring_buffer_
Definition audio_resampler.h:83

esphome::audio::AudioResampler::input_stream_info_
AudioStreamInfo input_stream_info_
Definition audio_resampler.h:95

esphome::audio::AudioResampler::resampler_
std::unique_ptr< esp_audio_libs::resampler::Resampler > resampler_
Definition audio_resampler.h:98

esphome::audio::AudioResampler::accumulated_frames_generated_
uint32_t accumulated_frames_generated_
Definition audio_resampler.h:91

esphome::audio::AudioResampler::add_source
esp_err_t add_source(std::weak_ptr< ring_buffer::RingBuffer > &input_ring_buffer)
Sets the ring buffer the audio is read from and takes shared ownership of it.
Definition audio_resampler.cpp:18

esphome::audio::AudioResampler::output_transfer_buffer_
std::unique_ptr< AudioSinkTransferBuffer > output_transfer_buffer_
Definition audio_resampler.h:85

esphome::audio::AudioResampler::add_sink
esp_err_t add_sink(std::weak_ptr< ring_buffer::RingBuffer > &output_ring_buffer)
Adds a sink ring buffer for resampled audio.
Definition audio_resampler.cpp:28

esphome::audio::AudioResampler::AudioResampler
AudioResampler(size_t input_buffer_size, size_t output_buffer_size)
Allocates the output transfer buffer.
Definition audio_resampler.cpp:13

esphome::audio::AudioResampler::start
esp_err_t start(AudioStreamInfo &input_stream_info, AudioStreamInfo &output_stream_info, uint16_t number_of_taps, uint16_t number_of_filters)
Sets up the class to resample.
Definition audio_resampler.cpp:46

esphome::audio::AudioResampler::accumulated_frames_used_
uint32_t accumulated_frames_used_
Definition audio_resampler.h:90

esphome::audio::AudioResampler::output_stream_info_
AudioStreamInfo output_stream_info_
Definition audio_resampler.h:96

esphome::audio::AudioResampler::audio_source_
std::unique_ptr< RingBufferAudioSource > audio_source_
Definition audio_resampler.h:84

esphome::audio::AudioSinkTransferBuffer::create
static std::unique_ptr< AudioSinkTransferBuffer > create(size_t buffer_size)
Creates a new sink transfer buffer.
Definition audio_transfer_buffer.cpp:13

esphome::audio::AudioStreamInfo
Definition audio.h:11

esphome::audio::AudioStreamInfo::frames_to_bytes
size_t frames_to_bytes(uint32_t frames) const
Converts frames to bytes.
Definition audio.h:53

esphome::audio::AudioStreamInfo::get_bits_per_sample
uint8_t get_bits_per_sample() const
Definition audio.h:28

esphome::audio::AudioStreamInfo::bytes_to_frames
uint32_t bytes_to_frames(size_t bytes) const
Convert bytes to frames.
Definition audio.h:43

esphome::audio::AudioStreamInfo::get_channels
uint8_t get_channels() const
Definition audio.h:29

esphome::audio::AudioStreamInfo::frames_to_milliseconds_with_remainder
uint32_t frames_to_milliseconds_with_remainder(uint32_t *frames) const
Computes the duration, in milliseconds, the given amount of frames represents.
Definition audio.cpp:29

esphome::audio::AudioStreamInfo::get_sample_rate
uint32_t get_sample_rate() const
Definition audio.h:30

esphome::audio::AudioStreamInfo::bytes_to_samples
uint32_t bytes_to_samples(size_t bytes) const
Convert bytes to samples.
Definition audio.h:48

esphome::audio::RingBufferAudioSource::MAX_ALIGNMENT_BYTES
static constexpr size_t MAX_ALIGNMENT_BYTES
Maximum supported alignment. Sized to cover 32-bit samples across up to 2 channels (8 bytes).
Definition audio_transfer_buffer.h:230

esphome::audio::RingBufferAudioSource::create
static std::unique_ptr< RingBufferAudioSource > create(std::shared_ptr< ring_buffer::RingBuffer > ring_buffer, size_t max_fill_bytes, uint8_t alignment_bytes=1)
Creates a new ring-buffer-backed audio source after validating its parameters.
Definition audio_transfer_buffer.cpp:210

esphome::speaker::Speaker
Definition speaker.h:28

hal.h

esphome::audio
Definition audio.cpp:7

esphome::audio::AudioResamplerState
AudioResamplerState
Definition audio_resampler.h:22

esphome::audio::AudioResamplerState::FINISHED
@ FINISHED

esphome::audio::AudioResamplerState::RESAMPLING
@ RESAMPLING

esphome::audio::AudioResamplerState::FAILED
@ FAILED

esphome::delay
void HOT delay(uint32_t ms)
Definition hal.cpp:85

uint32_t
static void uint32_t
Definition crash_handler.cpp:141