diff options
Diffstat (limited to 'webrtc/modules/audio_processing/include/audio_processing.h')
-rw-r--r-- | webrtc/modules/audio_processing/include/audio_processing.h | 1197 |
1 file changed, 584 insertions, 613 deletions
diff --git a/webrtc/modules/audio_processing/include/audio_processing.h b/webrtc/modules/audio_processing/include/audio_processing.h index c8ddc6a..d09e2ba 100644 --- a/webrtc/modules/audio_processing/include/audio_processing.h +++ b/webrtc/modules/audio_processing/include/audio_processing.h @@ -8,139 +8,109 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INCLUDE_AUDIO_PROCESSING_H_ -#define WEBRTC_MODULES_AUDIO_PROCESSING_INCLUDE_AUDIO_PROCESSING_H_ +#ifndef MODULES_AUDIO_PROCESSING_INCLUDE_AUDIO_PROCESSING_H_ +#define MODULES_AUDIO_PROCESSING_INCLUDE_AUDIO_PROCESSING_H_ // MSVC++ requires this to be set before any other includes to get M_PI. +#ifndef _USE_MATH_DEFINES #define _USE_MATH_DEFINES +#endif #include <math.h> #include <stddef.h> // size_t -#include <stdio.h> // FILE -#include <vector> +#include <stdio.h> // FILE +#include <string.h> -#include "webrtc/base/arraysize.h" -#include "webrtc/base/platform_file.h" -#include "webrtc/common.h" -#include "webrtc/modules/audio_processing/beamformer/array_util.h" -#include "webrtc/typedefs.h" +#include <vector> -struct AecCore; +#include "absl/types/optional.h" +#include "api/array_view.h" +#include "api/audio/echo_canceller3_config.h" +#include "api/audio/echo_control.h" +#include "api/scoped_refptr.h" +#include "modules/audio_processing/include/audio_processing_statistics.h" +#include "modules/audio_processing/include/config.h" +#include "rtc_base/arraysize.h" +#include "rtc_base/constructor_magic.h" +#include "rtc_base/deprecation.h" +#include "rtc_base/ref_count.h" +#include "rtc_base/system/file_wrapper.h" +#include "rtc_base/system/rtc_export.h" + +namespace rtc { +class TaskQueue; +} // namespace rtc namespace webrtc { -class AudioFrame; - -template<typename T> -class Beamformer; +class AecDump; +class AudioBuffer; class StreamConfig; class ProcessingConfig; -class EchoCancellation; -class EchoControlMobile; -class GainControl; -class 
HighPassFilter; -class LevelEstimator; -class NoiseSuppression; -class VoiceDetection; - -// Use to enable the extended filter mode in the AEC, along with robustness -// measures around the reported system delays. It comes with a significant -// increase in AEC complexity, but is much more robust to unreliable reported -// delays. -// -// Detailed changes to the algorithm: -// - The filter length is changed from 48 to 128 ms. This comes with tuning of -// several parameters: i) filter adaptation stepsize and error threshold; -// ii) non-linear processing smoothing and overdrive. -// - Option to ignore the reported delays on platforms which we deem -// sufficiently unreliable. See WEBRTC_UNTRUSTED_DELAY in echo_cancellation.c. -// - Faster startup times by removing the excessive "startup phase" processing -// of reported delays. -// - Much more conservative adjustments to the far-end read pointer. We smooth -// the delay difference more heavily, and back off from the difference more. -// Adjustments force a readaptation of the filter, so they should be avoided -// except when really necessary. -struct ExtendedFilter { - ExtendedFilter() : enabled(false) {} - explicit ExtendedFilter(bool enabled) : enabled(enabled) {} - bool enabled; -}; - -// Enables delay-agnostic echo cancellation. This feature relies on internally -// estimated delays between the process and reverse streams, thus not relying -// on reported system delays. This configuration only applies to -// EchoCancellation and not EchoControlMobile. It can be set in the constructor -// or using AudioProcessing::SetExtraOptions(). -struct DelayAgnostic { - DelayAgnostic() : enabled(false) {} - explicit DelayAgnostic(bool enabled) : enabled(enabled) {} - bool enabled; -}; +class EchoDetector; +class CustomAudioAnalyzer; +class CustomProcessing; // Use to enable experimental gain control (AGC). 
At startup the experimental // AGC moves the microphone volume up to |startup_min_volume| if the current // microphone volume is set too low. The value is clamped to its operating range // [12, 255]. Here, 255 maps to 100%. // -// Must be provided through AudioProcessing::Create(Confg&). +// Must be provided through AudioProcessingBuilder().Create(config). #if defined(WEBRTC_CHROMIUM_BUILD) static const int kAgcStartupMinVolume = 85; #else static const int kAgcStartupMinVolume = 0; #endif // defined(WEBRTC_CHROMIUM_BUILD) +static constexpr int kClippedLevelMin = 70; + +// To be deprecated: Please instead use the flag in the +// AudioProcessing::Config::AnalogGainController. +// TODO(webrtc:5298): Remove. struct ExperimentalAgc { - ExperimentalAgc() : enabled(true), startup_min_volume(kAgcStartupMinVolume) {} - explicit ExperimentalAgc(bool enabled) - : enabled(enabled), startup_min_volume(kAgcStartupMinVolume) {} + ExperimentalAgc() = default; + explicit ExperimentalAgc(bool enabled) : enabled(enabled) {} + ExperimentalAgc(bool enabled, + bool enabled_agc2_level_estimator, + bool digital_adaptive_disabled) + : enabled(enabled), + enabled_agc2_level_estimator(enabled_agc2_level_estimator), + digital_adaptive_disabled(digital_adaptive_disabled) {} + // Deprecated constructor: will be removed. 
+ ExperimentalAgc(bool enabled, + bool enabled_agc2_level_estimator, + bool digital_adaptive_disabled, + bool analyze_before_aec) + : enabled(enabled), + enabled_agc2_level_estimator(enabled_agc2_level_estimator), + digital_adaptive_disabled(digital_adaptive_disabled) {} ExperimentalAgc(bool enabled, int startup_min_volume) : enabled(enabled), startup_min_volume(startup_min_volume) {} - bool enabled; - int startup_min_volume; + ExperimentalAgc(bool enabled, int startup_min_volume, int clipped_level_min) + : enabled(enabled), + startup_min_volume(startup_min_volume), + clipped_level_min(clipped_level_min) {} + static const ConfigOptionID identifier = ConfigOptionID::kExperimentalAgc; + bool enabled = true; + int startup_min_volume = kAgcStartupMinVolume; + // Lowest microphone level that will be applied in response to clipping. + int clipped_level_min = kClippedLevelMin; + bool enabled_agc2_level_estimator = false; + bool digital_adaptive_disabled = false; }; +// To be deprecated: Please instead use the flag in the +// AudioProcessing::Config::TransientSuppression. +// // Use to enable experimental noise suppression. It can be set in the -// constructor or using AudioProcessing::SetExtraOptions(). +// constructor. +// TODO(webrtc:5298): Remove. struct ExperimentalNs { ExperimentalNs() : enabled(false) {} explicit ExperimentalNs(bool enabled) : enabled(enabled) {} - bool enabled; -}; - -// Use to enable beamforming. Must be provided through the constructor. It will -// have no impact if used with AudioProcessing::SetExtraOptions(). 
-struct Beamforming { - Beamforming() - : enabled(false), - array_geometry(), - target_direction( - SphericalPointf(static_cast<float>(M_PI) / 2.f, 0.f, 1.f)) {} - Beamforming(bool enabled, const std::vector<Point>& array_geometry) - : Beamforming(enabled, - array_geometry, - SphericalPointf(static_cast<float>(M_PI) / 2.f, 0.f, 1.f)) { - } - Beamforming(bool enabled, - const std::vector<Point>& array_geometry, - SphericalPointf target_direction) - : enabled(enabled), - array_geometry(array_geometry), - target_direction(target_direction) {} - const bool enabled; - const std::vector<Point> array_geometry; - const SphericalPointf target_direction; -}; - -// Use to enable intelligibility enhancer in audio processing. Must be provided -// though the constructor. It will have no impact if used with -// AudioProcessing::SetExtraOptions(). -// -// Note: If enabled and the reverse stream has more than one output channel, -// the reverse stream will become an upmixed mono signal. -struct Intelligibility { - Intelligibility() : enabled(false) {} - explicit Intelligibility(bool enabled) : enabled(enabled) {} + static const ConfigOptionID identifier = ConfigOptionID::kExperimentalNs; bool enabled; }; @@ -149,11 +119,11 @@ struct Intelligibility { // // APM operates on two audio streams on a frame-by-frame basis. Frames of the // primary stream, on which all processing is applied, are passed to -// |ProcessStream()|. Frames of the reverse direction stream, which are used for -// analysis by some components, are passed to |AnalyzeReverseStream()|. On the -// client-side, this will typically be the near-end (capture) and far-end -// (render) streams, respectively. APM should be placed in the signal chain as -// close to the audio hardware abstraction layer (HAL) as possible. +// |ProcessStream()|. Frames of the reverse direction stream are passed to +// |ProcessReverseStream()|. 
On the client-side, this will typically be the +// near-end (capture) and far-end (render) streams, respectively. APM should be +// placed in the signal chain as close to the audio hardware abstraction layer +// (HAL) as possible. // // On the server-side, the reverse stream will normally not be used, with // processing occurring on each incoming stream. @@ -178,36 +148,43 @@ struct Intelligibility { // data. // // Usage example, omitting error checking: -// AudioProcessing* apm = AudioProcessing::Create(0); +// AudioProcessing* apm = AudioProcessingBuilder().Create(); // -// apm->high_pass_filter()->Enable(true); +// AudioProcessing::Config config; +// config.echo_canceller.enabled = true; +// config.echo_canceller.mobile_mode = false; // -// apm->echo_cancellation()->enable_drift_compensation(false); -// apm->echo_cancellation()->Enable(true); +// config.gain_controller1.enabled = true; +// config.gain_controller1.mode = +// AudioProcessing::Config::GainController1::kAdaptiveAnalog; +// config.gain_controller1.analog_level_minimum = 0; +// config.gain_controller1.analog_level_maximum = 255; // -// apm->noise_reduction()->set_level(kHighSuppression); -// apm->noise_reduction()->Enable(true); +// config.gain_controller2.enabled = true; +// +// config.high_pass_filter.enabled = true; // -// apm->gain_control()->set_analog_level_limits(0, 255); -// apm->gain_control()->set_mode(kAdaptiveAnalog); -// apm->gain_control()->Enable(true); +// config.voice_detection.enabled = true; // -// apm->voice_detection()->Enable(true); +// apm->ApplyConfig(config) +// +// apm->noise_reduction()->set_level(kHighSuppression); +// apm->noise_reduction()->Enable(true); // // // Start a voice call... // // // ... Render frame arrives bound for the audio HAL ... -// apm->AnalyzeReverseStream(render_frame); +// apm->ProcessReverseStream(render_frame); // // // ... Capture frame arrives from the audio HAL ... // // Call required set_stream_ functions. 
// apm->set_stream_delay_ms(delay_ms); -// apm->gain_control()->set_stream_analog_level(analog_level); +// apm->set_stream_analog_level(analog_level); // // apm->ProcessStream(capture_frame); // // // Call required stream_ functions. -// analog_level = apm->gain_control()->stream_analog_level(); +// analog_level = apm->recommended_stream_analog_level(); // has_voice = apm->stream_has_voice(); // // // Repeate render and capture processing for the duration of the call... @@ -217,31 +194,298 @@ struct Intelligibility { // // Close the application... // delete apm; // -class AudioProcessing { +class RTC_EXPORT AudioProcessing : public rtc::RefCountInterface { public: + // The struct below constitutes the new parameter scheme for the audio + // processing. It is being introduced gradually and until it is fully + // introduced, it is prone to change. + // TODO(peah): Remove this comment once the new config scheme is fully rolled + // out. + // + // The parameters and behavior of the audio processing module are controlled + // by changing the default values in the AudioProcessing::Config struct. + // The config is applied by passing the struct to the ApplyConfig method. + // + // This config is intended to be used during setup, and to enable/disable + // top-level processing effects. Use during processing may cause undesired + // submodule resets, affecting the audio quality. Use the RuntimeSetting + // construct for runtime configuration. + struct RTC_EXPORT Config { + + // Sets the properties of the audio processing pipeline. + struct RTC_EXPORT Pipeline { + Pipeline(); + + // Maximum allowed processing rate used internally. May only be set to + // 32000 or 48000 and any differing values will be treated as 48000. The + // default rate is currently selected based on the CPU architecture, but + // that logic may change. + int maximum_internal_processing_rate; + // Allow multi-channel processing of render audio. 
+ bool multi_channel_render = false; + // Allow multi-channel processing of capture audio when AEC3 is active + // or a custom AEC is injected.. + bool multi_channel_capture = false; + } pipeline; + + // Enabled the pre-amplifier. It amplifies the capture signal + // before any other processing is done. + struct PreAmplifier { + bool enabled = false; + float fixed_gain_factor = 1.f; + } pre_amplifier; + + struct HighPassFilter { + bool enabled = false; + bool apply_in_full_band = true; + } high_pass_filter; + + struct EchoCanceller { + bool enabled = false; + bool mobile_mode = false; + bool export_linear_aec_output = false; + // Enforce the highpass filter to be on (has no effect for the mobile + // mode). + bool enforce_high_pass_filtering = true; + } echo_canceller; + + // Enables background noise suppression. + struct NoiseSuppression { + bool enabled = false; + enum Level { kLow, kModerate, kHigh, kVeryHigh }; + Level level = kModerate; + bool analyze_linear_aec_output_when_available = false; + } noise_suppression; + + // Enables transient suppression. + struct TransientSuppression { + bool enabled = false; + } transient_suppression; + + // Enables reporting of |voice_detected| in webrtc::AudioProcessingStats. + struct VoiceDetection { + bool enabled = false; + } voice_detection; + + // Enables automatic gain control (AGC) functionality. + // The automatic gain control (AGC) component brings the signal to an + // appropriate range. This is done by applying a digital gain directly and, + // in the analog mode, prescribing an analog gain to be applied at the audio + // HAL. + // Recommended to be enabled on the client-side. + struct GainController1 { + bool enabled = false; + enum Mode { + // Adaptive mode intended for use if an analog volume control is + // available on the capture device. It will require the user to provide + // coupling between the OS mixer controls and AGC through the + // stream_analog_level() functions. 
+ // It consists of an analog gain prescription for the audio device and a + // digital compression stage. + kAdaptiveAnalog, + // Adaptive mode intended for situations in which an analog volume + // control is unavailable. It operates in a similar fashion to the + // adaptive analog mode, but with scaling instead applied in the digital + // domain. As with the analog mode, it additionally uses a digital + // compression stage. + kAdaptiveDigital, + // Fixed mode which enables only the digital compression stage also used + // by the two adaptive modes. + // It is distinguished from the adaptive modes by considering only a + // short time-window of the input signal. It applies a fixed gain + // through most of the input level range, and compresses (gradually + // reduces gain with increasing level) the input signal at higher + // levels. This mode is preferred on embedded devices where the capture + // signal level is predictable, so that a known gain can be applied. + kFixedDigital + }; + Mode mode = kAdaptiveAnalog; + // Sets the target peak level (or envelope) of the AGC in dBFs (decibels + // from digital full-scale). The convention is to use positive values. For + // instance, passing in a value of 3 corresponds to -3 dBFs, or a target + // level 3 dB below full-scale. Limited to [0, 31]. + int target_level_dbfs = 3; + // Sets the maximum gain the digital compression stage may apply, in dB. A + // higher number corresponds to greater compression, while a value of 0 + // will leave the signal uncompressed. Limited to [0, 90]. + // For updates after APM setup, use a RuntimeSetting instead. + int compression_gain_db = 9; + // When enabled, the compression stage will hard limit the signal to the + // target level. Otherwise, the signal will be compressed but not limited + // above the target level. + bool enable_limiter = true; + // Sets the minimum and maximum analog levels of the audio capture device. + // Must be set if an analog mode is used. 
Limited to [0, 65535]. + int analog_level_minimum = 0; + int analog_level_maximum = 255; + + // Enables the analog gain controller functionality. + struct AnalogGainController { + bool enabled = true; + int startup_min_volume = kAgcStartupMinVolume; + // Lowest analog microphone level that will be applied in response to + // clipping. + int clipped_level_min = kClippedLevelMin; + bool enable_agc2_level_estimator = false; + bool enable_digital_adaptive = true; + } analog_gain_controller; + } gain_controller1; + + // Enables the next generation AGC functionality. This feature replaces the + // standard methods of gain control in the previous AGC. Enabling this + // submodule enables an adaptive digital AGC followed by a limiter. By + // setting |fixed_gain_db|, the limiter can be turned into a compressor that + // first applies a fixed gain. The adaptive digital AGC can be turned off by + // setting |adaptive_digital_mode=false|. + struct GainController2 { + enum LevelEstimator { kRms, kPeak }; + bool enabled = false; + struct { + float gain_db = 0.f; + } fixed_digital; + struct { + bool enabled = false; + float vad_probability_attack = 1.f; + LevelEstimator level_estimator = kRms; + int level_estimator_adjacent_speech_frames_threshold = 1; + // TODO(crbug.com/webrtc/7494): Remove `use_saturation_protector`. + bool use_saturation_protector = true; + float initial_saturation_margin_db = 20.f; + float extra_saturation_margin_db = 2.f; + int gain_applier_adjacent_speech_frames_threshold = 1; + float max_gain_change_db_per_second = 3.f; + float max_output_noise_level_dbfs = -50.f; + } adaptive_digital; + } gain_controller2; + + struct ResidualEchoDetector { + bool enabled = true; + } residual_echo_detector; + + // Enables reporting of |output_rms_dbfs| in webrtc::AudioProcessingStats. + struct LevelEstimation { + bool enabled = false; + } level_estimation; + + std::string ToString() const; + }; + // TODO(mgraczyk): Remove once all methods that use ChannelLayout are gone. 
enum ChannelLayout { kMono, // Left, right. kStereo, - // Mono, keyboard mic. + // Mono, keyboard, and mic. kMonoAndKeyboard, - // Left, right, keyboard mic. + // Left, right, keyboard, and mic. kStereoAndKeyboard }; - // Creates an APM instance. Use one instance for every primary audio stream - // requiring processing. On the client-side, this would typically be one - // instance for the near-end stream, and additional instances for each far-end - // stream which requires processing. On the server-side, this would typically - // be one instance for every incoming stream. - static AudioProcessing* Create(); - // Allows passing in an optional configuration at create-time. - static AudioProcessing* Create(const Config& config); - // Only for testing. - static AudioProcessing* Create(const Config& config, - Beamformer<float>* beamformer); - virtual ~AudioProcessing() {} + // Specifies the properties of a setting to be passed to AudioProcessing at + // runtime. + class RuntimeSetting { + public: + enum class Type { + kNotSpecified, + kCapturePreGain, + kCaptureCompressionGain, + kCaptureFixedPostGain, + kPlayoutVolumeChange, + kCustomRenderProcessingRuntimeSetting, + kPlayoutAudioDeviceChange, + kCaptureOutputUsed + }; + + // Play-out audio device properties. + struct PlayoutAudioDeviceInfo { + int id; // Identifies the audio device. + int max_volume; // Maximum play-out volume. + }; + + RuntimeSetting() : type_(Type::kNotSpecified), value_(0.f) {} + ~RuntimeSetting() = default; + + static RuntimeSetting CreateCapturePreGain(float gain) { + RTC_DCHECK_GE(gain, 1.f) << "Attenuation is not allowed."; + return {Type::kCapturePreGain, gain}; + } + + // Corresponds to Config::GainController1::compression_gain_db, but for + // runtime configuration. 
+ static RuntimeSetting CreateCompressionGainDb(int gain_db) { + RTC_DCHECK_GE(gain_db, 0); + RTC_DCHECK_LE(gain_db, 90); + return {Type::kCaptureCompressionGain, static_cast<float>(gain_db)}; + } + + // Corresponds to Config::GainController2::fixed_digital::gain_db, but for + // runtime configuration. + static RuntimeSetting CreateCaptureFixedPostGain(float gain_db) { + RTC_DCHECK_GE(gain_db, 0.f); + RTC_DCHECK_LE(gain_db, 90.f); + return {Type::kCaptureFixedPostGain, gain_db}; + } + + // Creates a runtime setting to notify play-out (aka render) audio device + // changes. + static RuntimeSetting CreatePlayoutAudioDeviceChange( + PlayoutAudioDeviceInfo audio_device) { + return {Type::kPlayoutAudioDeviceChange, audio_device}; + } + + // Creates a runtime setting to notify play-out (aka render) volume changes. + // |volume| is the unnormalized volume, the maximum of which + static RuntimeSetting CreatePlayoutVolumeChange(int volume) { + return {Type::kPlayoutVolumeChange, volume}; + } + + static RuntimeSetting CreateCustomRenderSetting(float payload) { + return {Type::kCustomRenderProcessingRuntimeSetting, payload}; + } + + static RuntimeSetting CreateCaptureOutputUsedSetting(bool payload) { + return {Type::kCaptureOutputUsed, payload}; + } + + Type type() const { return type_; } + // Getters do not return a value but instead modify the argument to protect + // from implicit casting. 
+ void GetFloat(float* value) const { + RTC_DCHECK(value); + *value = value_.float_value; + } + void GetInt(int* value) const { + RTC_DCHECK(value); + *value = value_.int_value; + } + void GetBool(bool* value) const { + RTC_DCHECK(value); + *value = value_.bool_value; + } + void GetPlayoutAudioDeviceInfo(PlayoutAudioDeviceInfo* value) const { + RTC_DCHECK(value); + *value = value_.playout_audio_device_info; + } + + private: + RuntimeSetting(Type id, float value) : type_(id), value_(value) {} + RuntimeSetting(Type id, int value) : type_(id), value_(value) {} + RuntimeSetting(Type id, PlayoutAudioDeviceInfo value) + : type_(id), value_(value) {} + Type type_; + union U { + U() {} + U(int value) : int_value(value) {} + U(float value) : float_value(value) {} + U(PlayoutAudioDeviceInfo value) : playout_audio_device_info(value) {} + float float_value; + int int_value; + bool bool_value; + PlayoutAudioDeviceInfo playout_audio_device_info; + } value_; + }; + + ~AudioProcessing() override {} // Initializes internal states, while retaining all user settings. This // should be called before beginning to process a new audio stream. However, @@ -250,8 +494,9 @@ class AudioProcessing { // // It is also not necessary to call if the audio parameters (sample // rate and number of channels) have changed. Passing updated parameters - // directly to |ProcessStream()| and |AnalyzeReverseStream()| is permissible. + // directly to |ProcessStream()| and |ProcessReverseStream()| is permissible. // If the parameters are known at init-time though, they may be provided. + // TODO(webrtc:5298): Change to return void. virtual int Initialize() = 0; // The int16 interfaces require: @@ -268,24 +513,25 @@ class AudioProcessing { // Initialize with unpacked parameters. See Initialize() above for details. // // TODO(mgraczyk): Remove once clients are updated to use the new interface. 
- virtual int Initialize(int input_sample_rate_hz, - int output_sample_rate_hz, - int reverse_sample_rate_hz, - ChannelLayout input_layout, - ChannelLayout output_layout, - ChannelLayout reverse_layout) = 0; + virtual int Initialize(int capture_input_sample_rate_hz, + int capture_output_sample_rate_hz, + int render_sample_rate_hz, + ChannelLayout capture_input_layout, + ChannelLayout capture_output_layout, + ChannelLayout render_input_layout) = 0; - // Pass down additional options which don't have explicit setters. This - // ensures the options are applied immediately. - virtual void SetExtraOptions(const Config& config) = 0; + // TODO(peah): This method is a temporary solution used to take control + // over the parameters in the audio processing module and is likely to change. + virtual void ApplyConfig(const Config& config) = 0; // TODO(ajm): Only intended for internal use. Make private and friend the // necessary classes? virtual int proc_sample_rate_hz() const = 0; virtual int proc_split_sample_rate_hz() const = 0; - virtual int num_input_channels() const = 0; - virtual int num_output_channels() const = 0; - virtual int num_reverse_channels() const = 0; + virtual size_t num_input_channels() const = 0; + virtual size_t num_proc_channels() const = 0; + virtual size_t num_output_channels() const = 0; + virtual size_t num_reverse_channels() const = 0; // Set to true when the output of AudioProcessing will be muted or in some // other way not used. Ideally, the captured audio would still be processed, @@ -293,34 +539,16 @@ class AudioProcessing { // Default false. virtual void set_output_will_be_muted(bool muted) = 0; - // Processes a 10 ms |frame| of the primary audio stream. On the client-side, - // this is the near-end (or captured) audio. - // - // If needed for enabled functionality, any function with the set_stream_ tag - // must be called prior to processing the current frame. 
Any getter function - // with the stream_ tag which is needed should be called after processing. - // - // The |sample_rate_hz_|, |num_channels_|, and |samples_per_channel_| - // members of |frame| must be valid. If changed from the previous call to this - // method, it will trigger an initialization. - virtual int ProcessStream(AudioFrame* frame) = 0; + // Enqueue a runtime setting. + virtual void SetRuntimeSetting(RuntimeSetting setting) = 0; - // Accepts deinterleaved float audio with the range [-1, 1]. Each element - // of |src| points to a channel buffer, arranged according to - // |input_layout|. At output, the channels will be arranged according to - // |output_layout| at |output_sample_rate_hz| in |dest|. - // - // The output layout must have one channel or as many channels as the input. - // |src| and |dest| may use the same memory, if desired. - // - // TODO(mgraczyk): Remove once clients are updated to use the new interface. - virtual int ProcessStream(const float* const* src, - size_t samples_per_channel, - int input_sample_rate_hz, - ChannelLayout input_layout, - int output_sample_rate_hz, - ChannelLayout output_layout, - float* const* dest) = 0; + // Accepts and produces a 10 ms frame interleaved 16 bit integer audio as + // specified in |input_config| and |output_config|. |src| and |dest| may use + // the same memory, if desired. + virtual int ProcessStream(const int16_t* const src, + const StreamConfig& input_config, + const StreamConfig& output_config, + int16_t* const dest) = 0; // Accepts deinterleaved float audio with the range [-1, 1]. Each element of // |src| points to a channel buffer, arranged according to |input_stream|. At @@ -334,116 +562,106 @@ class AudioProcessing { const StreamConfig& output_config, float* const* dest) = 0; - // Analyzes a 10 ms |frame| of the reverse direction audio stream. The frame - // will not be modified. On the client-side, this is the far-end (or to be - // rendered) audio. 
- // - // It is only necessary to provide this if echo processing is enabled, as the - // reverse stream forms the echo reference signal. It is recommended, but not - // necessary, to provide if gain control is enabled. On the server-side this - // typically will not be used. If you're not sure what to pass in here, - // chances are you don't need to use it. - // - // The |sample_rate_hz_|, |num_channels_|, and |samples_per_channel_| - // members of |frame| must be valid. |sample_rate_hz_| must correspond to - // |input_sample_rate_hz()| - // - // TODO(ajm): add const to input; requires an implementation fix. - // DEPRECATED: Use |ProcessReverseStream| instead. - // TODO(ekm): Remove once all users have updated to |ProcessReverseStream|. - virtual int AnalyzeReverseStream(AudioFrame* frame) = 0; - - // Same as |AnalyzeReverseStream|, but may modify |frame| if intelligibility - // is enabled. - virtual int ProcessReverseStream(AudioFrame* frame) = 0; - - // Accepts deinterleaved float audio with the range [-1, 1]. Each element - // of |data| points to a channel buffer, arranged according to |layout|. - // TODO(mgraczyk): Remove once clients are updated to use the new interface. - virtual int AnalyzeReverseStream(const float* const* data, - size_t samples_per_channel, - int rev_sample_rate_hz, - ChannelLayout layout) = 0; + // Accepts and produces a 10 ms frame of interleaved 16 bit integer audio for + // the reverse direction audio stream as specified in |input_config| and + // |output_config|. |src| and |dest| may use the same memory, if desired. + virtual int ProcessReverseStream(const int16_t* const src, + const StreamConfig& input_config, + const StreamConfig& output_config, + int16_t* const dest) = 0; // Accepts deinterleaved float audio with the range [-1, 1]. Each element of // |data| points to a channel buffer, arranged according to |reverse_config|. 
virtual int ProcessReverseStream(const float* const* src, - const StreamConfig& reverse_input_config, - const StreamConfig& reverse_output_config, + const StreamConfig& input_config, + const StreamConfig& output_config, float* const* dest) = 0; + // Accepts deinterleaved float audio with the range [-1, 1]. Each element + // of |data| points to a channel buffer, arranged according to + // |reverse_config|. + virtual int AnalyzeReverseStream(const float* const* data, + const StreamConfig& reverse_config) = 0; + + // Returns the most recently produced 10 ms of the linear AEC output at a rate + // of 16 kHz. If there is more than one capture channel, a mono representation + // of the input is returned. Returns true/false to indicate whether an output + // returned. + virtual bool GetLinearAecOutput( + rtc::ArrayView<std::array<float, 160>> linear_output) const = 0; + + // This must be called prior to ProcessStream() if and only if adaptive analog + // gain control is enabled, to pass the current analog level from the audio + // HAL. Must be within the range provided in Config::GainController1. + virtual void set_stream_analog_level(int level) = 0; + + // When an analog mode is set, this should be called after ProcessStream() + // to obtain the recommended new analog level for the audio HAL. It is the + // user's responsibility to apply this level. + virtual int recommended_stream_analog_level() const = 0; + // This must be called if and only if echo processing is enabled. // - // Sets the |delay| in ms between AnalyzeReverseStream() receiving a far-end + // Sets the |delay| in ms between ProcessReverseStream() receiving a far-end // frame and ProcessStream() receiving a near-end frame containing the // corresponding echo. 
On the client-side this can be expressed as // delay = (t_render - t_analyze) + (t_process - t_capture) // where, - // - t_analyze is the time a frame is passed to AnalyzeReverseStream() and + // - t_analyze is the time a frame is passed to ProcessReverseStream() and // t_render is the time the first sample of the same frame is rendered by // the audio hardware. // - t_capture is the time the first sample of a frame is captured by the - // audio hardware and t_pull is the time the same frame is passed to + // audio hardware and t_process is the time the same frame is passed to // ProcessStream(). virtual int set_stream_delay_ms(int delay) = 0; virtual int stream_delay_ms() const = 0; - virtual bool was_stream_delay_set() const = 0; // Call to signal that a key press occurred (true) or did not occur (false) // with this chunk of audio. virtual void set_stream_key_pressed(bool key_pressed) = 0; - // Sets a delay |offset| in ms to add to the values passed in through - // set_stream_delay_ms(). May be positive or negative. - // - // Note that this could cause an otherwise valid value passed to - // set_stream_delay_ms() to return an error. - virtual void set_delay_offset_ms(int offset) = 0; - virtual int delay_offset_ms() const = 0; - - // Starts recording debugging information to a file specified by |filename|, - // a NULL-terminated string. If there is an ongoing recording, the old file - // will be closed, and recording will continue in the newly specified file. - // An already existing file will be overwritten without warning. - static const size_t kMaxFilenameSize = 1024; - virtual int StartDebugRecording(const char filename[kMaxFilenameSize]) = 0; - - // Same as above but uses an existing file handle. Takes ownership - // of |handle| and closes it at StopDebugRecording(). - virtual int StartDebugRecording(FILE* handle) = 0; - - // Same as above but uses an existing PlatformFile handle. Takes ownership - // of |handle| and closes it at StopDebugRecording(). 
- // TODO(xians): Make this interface pure virtual. - virtual int StartDebugRecordingForPlatformFile(rtc::PlatformFile handle) { - return -1; - } - - // Stops recording debugging information, and closes the file. Recording - // cannot be resumed in the same file (without overwriting it). - virtual int StopDebugRecording() = 0; - - // Use to send UMA histograms at end of a call. Note that all histogram - // specific member variables are reset. - virtual void UpdateHistogramsOnCallEnd() = 0; - - // These provide access to the component interfaces and should never return - // NULL. The pointers will be valid for the lifetime of the APM instance. - // The memory for these objects is entirely managed internally. - virtual EchoCancellation* echo_cancellation() const = 0; - virtual EchoControlMobile* echo_control_mobile() const = 0; - virtual GainControl* gain_control() const = 0; - virtual HighPassFilter* high_pass_filter() const = 0; - virtual LevelEstimator* level_estimator() const = 0; - virtual NoiseSuppression* noise_suppression() const = 0; - virtual VoiceDetection* voice_detection() const = 0; - - struct Statistic { - int instant; // Instantaneous value. - int average; // Long-term average. - int maximum; // Long-term maximum. - int minimum; // Long-term minimum. - }; + // Creates and attaches an webrtc::AecDump for recording debugging + // information. + // The |worker_queue| may not be null and must outlive the created + // AecDump instance. |max_log_size_bytes == -1| means the log size + // will be unlimited. |handle| may not be null. The AecDump takes + // responsibility for |handle| and closes it in the destructor. A + // return value of true indicates that the file has been + // sucessfully opened, while a value of false indicates that + // opening the file failed. 
+ virtual bool CreateAndAttachAecDump(const std::string& file_name, + int64_t max_log_size_bytes, + rtc::TaskQueue* worker_queue) = 0; + virtual bool CreateAndAttachAecDump(FILE* handle, + int64_t max_log_size_bytes, + rtc::TaskQueue* worker_queue) = 0; + + // TODO(webrtc:5298) Deprecated variant. + // Attaches provided webrtc::AecDump for recording debugging + // information. Log file and maximum file size logic is supposed to + // be handled by implementing instance of AecDump. Calling this + // method when another AecDump is attached resets the active AecDump + // with a new one. This causes the d-tor of the earlier AecDump to + // be called. The d-tor call may block until all pending logging + // tasks are completed. + virtual void AttachAecDump(std::unique_ptr<AecDump> aec_dump) = 0; + + // If no AecDump is attached, this has no effect. If an AecDump is + // attached, it's destructor is called. The d-tor may block until + // all pending logging tasks are completed. + virtual void DetachAecDump() = 0; + + // Get audio processing statistics. + virtual AudioProcessingStats GetStatistics() = 0; + // TODO(webrtc:5298) Deprecated variant. The |has_remote_tracks| argument + // should be set if there are active remote tracks (this would usually be true + // during a call). If there are no remote tracks some of the stats will not be + // set by AudioProcessing, because they only make sense if there is at least + // one remote track. + virtual AudioProcessingStats GetStatistics(bool has_remote_tracks) = 0; + + // Returns the last applied configuration. + virtual AudioProcessing::Config GetConfig() const = 0; enum Error { // Fatal errors. @@ -467,6 +685,7 @@ class AudioProcessing { kBadStreamParameterWarning = -13 }; + // Native rates supported by the integer interfaces. 
enum NativeRate { kSampleRate8kHz = 8000, kSampleRate16kHz = 16000, @@ -474,14 +693,67 @@ class AudioProcessing { kSampleRate48kHz = 48000 }; - static const int kNativeSampleRatesHz[]; - static const size_t kNumNativeSampleRates; - static const int kMaxNativeSampleRateHz; - static const int kMaxAECMSampleRateHz; + // TODO(kwiberg): We currently need to support a compiler (Visual C++) that + // complains if we don't explicitly state the size of the array here. Remove + // the size when that's no longer the case. + static constexpr int kNativeSampleRatesHz[4] = { + kSampleRate8kHz, kSampleRate16kHz, kSampleRate32kHz, kSampleRate48kHz}; + static constexpr size_t kNumNativeSampleRates = + arraysize(kNativeSampleRatesHz); + static constexpr int kMaxNativeSampleRateHz = + kNativeSampleRatesHz[kNumNativeSampleRates - 1]; static const int kChunkSizeMs = 10; }; +class RTC_EXPORT AudioProcessingBuilder { + public: + AudioProcessingBuilder(); + ~AudioProcessingBuilder(); + // The AudioProcessingBuilder takes ownership of the echo_control_factory. + AudioProcessingBuilder& SetEchoControlFactory( + std::unique_ptr<EchoControlFactory> echo_control_factory) { + echo_control_factory_ = std::move(echo_control_factory); + return *this; + } + // The AudioProcessingBuilder takes ownership of the capture_post_processing. + AudioProcessingBuilder& SetCapturePostProcessing( + std::unique_ptr<CustomProcessing> capture_post_processing) { + capture_post_processing_ = std::move(capture_post_processing); + return *this; + } + // The AudioProcessingBuilder takes ownership of the render_pre_processing. + AudioProcessingBuilder& SetRenderPreProcessing( + std::unique_ptr<CustomProcessing> render_pre_processing) { + render_pre_processing_ = std::move(render_pre_processing); + return *this; + } + // The AudioProcessingBuilder takes ownership of the echo_detector. 
+ AudioProcessingBuilder& SetEchoDetector( + rtc::scoped_refptr<EchoDetector> echo_detector) { + echo_detector_ = std::move(echo_detector); + return *this; + } + // The AudioProcessingBuilder takes ownership of the capture_analyzer. + AudioProcessingBuilder& SetCaptureAnalyzer( + std::unique_ptr<CustomAudioAnalyzer> capture_analyzer) { + capture_analyzer_ = std::move(capture_analyzer); + return *this; + } + // This creates an APM instance using the previously set components. Calling + // the Create function resets the AudioProcessingBuilder to its initial state. + AudioProcessing* Create(); + AudioProcessing* Create(const webrtc::Config& config); + + private: + std::unique_ptr<EchoControlFactory> echo_control_factory_; + std::unique_ptr<CustomProcessing> capture_post_processing_; + std::unique_ptr<CustomProcessing> render_pre_processing_; + rtc::scoped_refptr<EchoDetector> echo_detector_; + std::unique_ptr<CustomAudioAnalyzer> capture_analyzer_; + RTC_DISALLOW_COPY_AND_ASSIGN(AudioProcessingBuilder); +}; + class StreamConfig { public: // sample_rate_hz: The sampling rate of the stream. @@ -497,7 +769,7 @@ class StreamConfig { // is true, the last channel in any corresponding list of // channels is the keyboard channel. StreamConfig(int sample_rate_hz = 0, - int num_channels = 0, + size_t num_channels = 0, bool has_keyboard = false) : sample_rate_hz_(sample_rate_hz), num_channels_(num_channels), @@ -508,14 +780,14 @@ class StreamConfig { sample_rate_hz_ = value; num_frames_ = calculate_frames(value); } - void set_num_channels(int value) { num_channels_ = value; } + void set_num_channels(size_t value) { num_channels_ = value; } void set_has_keyboard(bool value) { has_keyboard_ = value; } int sample_rate_hz() const { return sample_rate_hz_; } // The number of channels in the stream, not including the keyboard channel if // present. 
- int num_channels() const { return num_channels_; } + size_t num_channels() const { return num_channels_; } bool has_keyboard() const { return has_keyboard_; } size_t num_frames() const { return num_frames_; } @@ -531,12 +803,12 @@ class StreamConfig { private: static size_t calculate_frames(int sample_rate_hz) { - return static_cast<size_t>( - AudioProcessing::kChunkSizeMs * sample_rate_hz / 1000); + return static_cast<size_t>(AudioProcessing::kChunkSizeMs * sample_rate_hz / + 1000); } int sample_rate_hz_; - int num_channels_; + size_t num_channels_; bool has_keyboard_; size_t num_frames_; }; @@ -589,365 +861,64 @@ class ProcessingConfig { StreamConfig streams[StreamName::kNumStreamNames]; }; -// The acoustic echo cancellation (AEC) component provides better performance -// than AECM but also requires more processing power and is dependent on delay -// stability and reporting accuracy. As such it is well-suited and recommended -// for PC and IP phone applications. -// -// Not recommended to be enabled on the server-side. -class EchoCancellation { - public: - // EchoCancellation and EchoControlMobile may not be enabled simultaneously. - // Enabling one will disable the other. - virtual int Enable(bool enable) = 0; - virtual bool is_enabled() const = 0; - - // Differences in clock speed on the primary and reverse streams can impact - // the AEC performance. On the client-side, this could be seen when different - // render and capture devices are used, particularly with webcams. - // - // This enables a compensation mechanism, and requires that - // set_stream_drift_samples() be called. - virtual int enable_drift_compensation(bool enable) = 0; - virtual bool is_drift_compensation_enabled() const = 0; - - // Sets the difference between the number of samples rendered and captured by - // the audio devices since the last call to |ProcessStream()|. Must be called - // if drift compensation is enabled, prior to |ProcessStream()|. 
- virtual void set_stream_drift_samples(int drift) = 0; - virtual int stream_drift_samples() const = 0; - - enum SuppressionLevel { - kLowSuppression, - kModerateSuppression, - kHighSuppression - }; - - // Sets the aggressiveness of the suppressor. A higher level trades off - // double-talk performance for increased echo suppression. - virtual int set_suppression_level(SuppressionLevel level) = 0; - virtual SuppressionLevel suppression_level() const = 0; - - // Returns false if the current frame almost certainly contains no echo - // and true if it _might_ contain echo. - virtual bool stream_has_echo() const = 0; - - // Enables the computation of various echo metrics. These are obtained - // through |GetMetrics()|. - virtual int enable_metrics(bool enable) = 0; - virtual bool are_metrics_enabled() const = 0; - - // Each statistic is reported in dB. - // P_far: Far-end (render) signal power. - // P_echo: Near-end (capture) echo signal power. - // P_out: Signal power at the output of the AEC. - // P_a: Internal signal power at the point before the AEC's non-linear - // processor. - struct Metrics { - // RERL = ERL + ERLE - AudioProcessing::Statistic residual_echo_return_loss; - - // ERL = 10log_10(P_far / P_echo) - AudioProcessing::Statistic echo_return_loss; - - // ERLE = 10log_10(P_echo / P_out) - AudioProcessing::Statistic echo_return_loss_enhancement; - - // (Pre non-linear processing suppression) A_NLP = 10log_10(P_echo / P_a) - AudioProcessing::Statistic a_nlp; - }; - - // TODO(ajm): discuss the metrics update period. - virtual int GetMetrics(Metrics* metrics) = 0; - - // Enables computation and logging of delay values. Statistics are obtained - // through |GetDelayMetrics()|. - virtual int enable_delay_logging(bool enable) = 0; - virtual bool is_delay_logging_enabled() const = 0; - - // The delay metrics consists of the delay |median| and the delay standard - // deviation |std|. 
It also consists of the fraction of delay estimates - // |fraction_poor_delays| that can make the echo cancellation perform poorly. - // The values are aggregated until the first call to |GetDelayMetrics()| and - // afterwards aggregated and updated every second. - // Note that if there are several clients pulling metrics from - // |GetDelayMetrics()| during a session the first call from any of them will - // change to one second aggregation window for all. - // TODO(bjornv): Deprecated, remove. - virtual int GetDelayMetrics(int* median, int* std) = 0; - virtual int GetDelayMetrics(int* median, int* std, - float* fraction_poor_delays) = 0; - - // Returns a pointer to the low level AEC component. In case of multiple - // channels, the pointer to the first one is returned. A NULL pointer is - // returned when the AEC component is disabled or has not been initialized - // successfully. - virtual struct AecCore* aec_core() const = 0; - - protected: - virtual ~EchoCancellation() {} -}; - -// The acoustic echo control for mobile (AECM) component is a low complexity -// robust option intended for use on mobile devices. -// -// Not recommended to be enabled on the server-side. -class EchoControlMobile { +// Experimental interface for a custom analysis submodule. +class CustomAudioAnalyzer { public: - // EchoCancellation and EchoControlMobile may not be enabled simultaneously. - // Enabling one will disable the other. - virtual int Enable(bool enable) = 0; - virtual bool is_enabled() const = 0; - - // Recommended settings for particular audio routes. In general, the louder - // the echo is expected to be, the higher this value should be set. The - // preferred setting may vary from device to device. - enum RoutingMode { - kQuietEarpieceOrHeadset, - kEarpiece, - kLoudEarpiece, - kSpeakerphone, - kLoudSpeakerphone - }; - - // Sets echo control appropriate for the audio routing |mode| on the device. - // It can and should be updated during a call if the audio routing changes. 
- virtual int set_routing_mode(RoutingMode mode) = 0; - virtual RoutingMode routing_mode() const = 0; - - // Comfort noise replaces suppressed background noise to maintain a - // consistent signal level. - virtual int enable_comfort_noise(bool enable) = 0; - virtual bool is_comfort_noise_enabled() const = 0; - - // A typical use case is to initialize the component with an echo path from a - // previous call. The echo path is retrieved using |GetEchoPath()|, typically - // at the end of a call. The data can then be stored for later use as an - // initializer before the next call, using |SetEchoPath()|. - // - // Controlling the echo path this way requires the data |size_bytes| to match - // the internal echo path size. This size can be acquired using - // |echo_path_size_bytes()|. |SetEchoPath()| causes an entire reset, worth - // noting if it is to be called during an ongoing call. - // - // It is possible that version incompatibilities may result in a stored echo - // path of the incorrect size. In this case, the stored path should be - // discarded. - virtual int SetEchoPath(const void* echo_path, size_t size_bytes) = 0; - virtual int GetEchoPath(void* echo_path, size_t size_bytes) const = 0; - - // The returned path size is guaranteed not to change for the lifetime of - // the application. - static size_t echo_path_size_bytes(); - - protected: - virtual ~EchoControlMobile() {} + // (Re-) Initializes the submodule. + virtual void Initialize(int sample_rate_hz, int num_channels) = 0; + // Analyzes the given capture or render signal. + virtual void Analyze(const AudioBuffer* audio) = 0; + // Returns a string representation of the module state. + virtual std::string ToString() const = 0; + + virtual ~CustomAudioAnalyzer() {} }; -// The automatic gain control (AGC) component brings the signal to an -// appropriate range. This is done by applying a digital gain directly and, in -// the analog mode, prescribing an analog gain to be applied at the audio HAL. 
-// -// Recommended to be enabled on the client-side. -class GainControl { +// Interface for a custom processing submodule. +class CustomProcessing { public: - virtual int Enable(bool enable) = 0; - virtual bool is_enabled() const = 0; - - // When an analog mode is set, this must be called prior to |ProcessStream()| - // to pass the current analog level from the audio HAL. Must be within the - // range provided to |set_analog_level_limits()|. - virtual int set_stream_analog_level(int level) = 0; - - // When an analog mode is set, this should be called after |ProcessStream()| - // to obtain the recommended new analog level for the audio HAL. It is the - // users responsibility to apply this level. - virtual int stream_analog_level() = 0; - - enum Mode { - // Adaptive mode intended for use if an analog volume control is available - // on the capture device. It will require the user to provide coupling - // between the OS mixer controls and AGC through the |stream_analog_level()| - // functions. - // - // It consists of an analog gain prescription for the audio device and a - // digital compression stage. - kAdaptiveAnalog, - - // Adaptive mode intended for situations in which an analog volume control - // is unavailable. It operates in a similar fashion to the adaptive analog - // mode, but with scaling instead applied in the digital domain. As with - // the analog mode, it additionally uses a digital compression stage. - kAdaptiveDigital, - - // Fixed mode which enables only the digital compression stage also used by - // the two adaptive modes. - // - // It is distinguished from the adaptive modes by considering only a - // short time-window of the input signal. It applies a fixed gain through - // most of the input level range, and compresses (gradually reduces gain - // with increasing level) the input signal at higher levels. This mode is - // preferred on embedded devices where the capture signal level is - // predictable, so that a known gain can be applied. 
- kFixedDigital - }; - - virtual int set_mode(Mode mode) = 0; - virtual Mode mode() const = 0; - - // Sets the target peak |level| (or envelope) of the AGC in dBFs (decibels - // from digital full-scale). The convention is to use positive values. For - // instance, passing in a value of 3 corresponds to -3 dBFs, or a target - // level 3 dB below full-scale. Limited to [0, 31]. - // - // TODO(ajm): use a negative value here instead, if/when VoE will similarly - // update its interface. - virtual int set_target_level_dbfs(int level) = 0; - virtual int target_level_dbfs() const = 0; - - // Sets the maximum |gain| the digital compression stage may apply, in dB. A - // higher number corresponds to greater compression, while a value of 0 will - // leave the signal uncompressed. Limited to [0, 90]. - virtual int set_compression_gain_db(int gain) = 0; - virtual int compression_gain_db() const = 0; - - // When enabled, the compression stage will hard limit the signal to the - // target level. Otherwise, the signal will be compressed but not limited - // above the target level. - virtual int enable_limiter(bool enable) = 0; - virtual bool is_limiter_enabled() const = 0; - - // Sets the |minimum| and |maximum| analog levels of the audio capture device. - // Must be set if and only if an analog mode is used. Limited to [0, 65535]. - virtual int set_analog_level_limits(int minimum, - int maximum) = 0; - virtual int analog_level_minimum() const = 0; - virtual int analog_level_maximum() const = 0; - - // Returns true if the AGC has detected a saturation event (period where the - // signal reaches digital full-scale) in the current frame and the analog - // level cannot be reduced. - // - // This could be used as an indicator to reduce or disable analog mic gain at - // the audio HAL. - virtual bool stream_is_saturated() const = 0; - - protected: - virtual ~GainControl() {} + // (Re-)Initializes the submodule. 
+ virtual void Initialize(int sample_rate_hz, int num_channels) = 0; + // Processes the given capture or render signal. + virtual void Process(AudioBuffer* audio) = 0; + // Returns a string representation of the module state. + virtual std::string ToString() const = 0; + // Handles RuntimeSettings. TODO(webrtc:9262): make pure virtual + // after updating dependencies. + virtual void SetRuntimeSetting(AudioProcessing::RuntimeSetting setting); + + virtual ~CustomProcessing() {} }; -// A filtering component which removes DC offset and low-frequency noise. -// Recommended to be enabled on the client-side. -class HighPassFilter { +// Interface for an echo detector submodule. +class EchoDetector : public rtc::RefCountInterface { public: - virtual int Enable(bool enable) = 0; - virtual bool is_enabled() const = 0; + // (Re-)Initializes the submodule. + virtual void Initialize(int capture_sample_rate_hz, + int num_capture_channels, + int render_sample_rate_hz, + int num_render_channels) = 0; - protected: - virtual ~HighPassFilter() {} -}; - -// An estimation component used to retrieve level metrics. -class LevelEstimator { - public: - virtual int Enable(bool enable) = 0; - virtual bool is_enabled() const = 0; + // Analysis (not changing) of the render signal. + virtual void AnalyzeRenderAudio(rtc::ArrayView<const float> render_audio) = 0; - // Returns the root mean square (RMS) level in dBFs (decibels from digital - // full-scale), or alternately dBov. It is computed over all primary stream - // frames since the last call to RMS(). The returned value is positive but - // should be interpreted as negative. It is constrained to [0, 127]. - // - // The computation follows: https://tools.ietf.org/html/rfc6465 - // with the intent that it can provide the RTP audio level indication. - // - // Frames passed to ProcessStream() with an |_energy| of zero are considered - // to have been muted. The RMS of the frame will be interpreted as -127. 
- virtual int RMS() = 0; + // Analysis (not changing) of the capture signal. + virtual void AnalyzeCaptureAudio( + rtc::ArrayView<const float> capture_audio) = 0; - protected: - virtual ~LevelEstimator() {} -}; + // Pack an AudioBuffer into a vector<float>. + static void PackRenderAudioBuffer(AudioBuffer* audio, + std::vector<float>* packed_buffer); -// The noise suppression (NS) component attempts to remove noise while -// retaining speech. Recommended to be enabled on the client-side. -// -// Recommended to be enabled on the client-side. -class NoiseSuppression { - public: - virtual int Enable(bool enable) = 0; - virtual bool is_enabled() const = 0; - - // Determines the aggressiveness of the suppression. Increasing the level - // will reduce the noise level at the expense of a higher speech distortion. - enum Level { - kLow, - kModerate, - kHigh, - kVeryHigh + struct Metrics { + absl::optional<double> echo_likelihood; + absl::optional<double> echo_likelihood_recent_max; }; - virtual int set_level(Level level) = 0; - virtual Level level() const = 0; - - // Returns the internally computed prior speech probability of current frame - // averaged over output channels. This is not supported in fixed point, for - // which |kUnsupportedFunctionError| is returned. - virtual float speech_probability() const = 0; - - protected: - virtual ~NoiseSuppression() {} + // Collect current metrics from the echo detector. + virtual Metrics GetMetrics() const = 0; }; -// The voice activity detection (VAD) component analyzes the stream to -// determine if voice is present. A facility is also provided to pass in an -// external VAD decision. -// -// In addition to |stream_has_voice()| the VAD decision is provided through the -// |AudioFrame| passed to |ProcessStream()|. The |vad_activity_| member will be -// modified to reflect the current decision. 
-class VoiceDetection { - public: - virtual int Enable(bool enable) = 0; - virtual bool is_enabled() const = 0; - - // Returns true if voice is detected in the current frame. Should be called - // after |ProcessStream()|. - virtual bool stream_has_voice() const = 0; - - // Some of the APM functionality requires a VAD decision. In the case that - // a decision is externally available for the current frame, it can be passed - // in here, before |ProcessStream()| is called. - // - // VoiceDetection does _not_ need to be enabled to use this. If it happens to - // be enabled, detection will be skipped for any frame in which an external - // VAD decision is provided. - virtual int set_stream_has_voice(bool has_voice) = 0; - - // Specifies the likelihood that a frame will be declared to contain voice. - // A higher value makes it more likely that speech will not be clipped, at - // the expense of more noise being detected as voice. - enum Likelihood { - kVeryLowLikelihood, - kLowLikelihood, - kModerateLikelihood, - kHighLikelihood - }; - - virtual int set_likelihood(Likelihood likelihood) = 0; - virtual Likelihood likelihood() const = 0; - - // Sets the |size| of the frames in ms on which the VAD will operate. Larger - // frames will improve detection accuracy, but reduce the frequency of - // updates. - // - // This does not impact the size of frames passed to |ProcessStream()|. - virtual int set_frame_size_ms(int size) = 0; - virtual int frame_size_ms() const = 0; - - protected: - virtual ~VoiceDetection() {} -}; } // namespace webrtc -#endif // WEBRTC_MODULES_AUDIO_PROCESSING_INCLUDE_AUDIO_PROCESSING_H_ +#endif // MODULES_AUDIO_PROCESSING_INCLUDE_AUDIO_PROCESSING_H_ |