webrtc/common_audio/vad/vad_filterbank.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331

/*
 *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "webrtc/common_audio/vad/vad_filterbank.h"

#include <assert.h>

#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
#include "webrtc/typedefs.h"

// Constants used in LogOfEnergy().
static const int16_t kLogConst = 24660;  // 160*log10(2) in Q9.
static const int16_t kLogEnergyIntPart = 14336;  // 14 in Q10

// Coefficients used by HighPassFilter, Q14.
static const int16_t kHpZeroCoefs[3] = { 6631, -13262, 6631 };
static const int16_t kHpPoleCoefs[3] = { 16384, -7756, 5620 };

// Allpass filter coefficients, upper and lower, in Q15.
// Upper: 0.64, Lower: 0.17
static const int16_t kAllPassCoefsQ15[2] = { 20972, 5571 };

// Adjustment for division with two in SplitFilter.
static const int16_t kOffsetVector[6] = { 368, 368, 272, 176, 176, 176 };

// High pass filtering, with a cut-off frequency at 80 Hz, if the |data_in| is
// sampled at 500 Hz.
//
// - data_in      [i]   : Input audio data sampled at 500 Hz.
// - data_length  [i]   : Length of input and output data.
// - filter_state [i/o] : State of the filter.
// - data_out     [o]   : Output audio data in the frequency interval
//                        80 - 250 Hz.
static void HighPassFilter(const int16_t* data_in, size_t data_length,
                           int16_t* filter_state, int16_t* data_out) {
  size_t i;
  const int16_t* in_ptr = data_in;
  int16_t* out_ptr = data_out;
  int32_t tmp32 = 0;


  // The sum of the absolute values of the impulse response:
  // The zero/pole-filter has a max amplification of a single sample of: 1.4546
  // Impulse response: 0.4047 -0.6179 -0.0266  0.1993  0.1035  -0.0194
  // The all-zero section has a max amplification of a single sample of: 1.6189
  // Impulse response: 0.4047 -0.8094  0.4047  0       0        0
  // The all-pole section has a max amplification of a single sample of: 1.9931
  // Impulse response: 1.0000  0.4734 -0.1189 -0.2187 -0.0627   0.04532

  for (i = 0; i < data_length; i++) {
    // All-zero section (filter coefficients in Q14).
    tmp32 = kHpZeroCoefs[0] * *in_ptr;
    tmp32 += kHpZeroCoefs[1] * filter_state[0];
    tmp32 += kHpZeroCoefs[2] * filter_state[1];
    filter_state[1] = filter_state[0];
    filter_state[0] = *in_ptr++;

    // All-pole section (filter coefficients in Q14).
    tmp32 -= kHpPoleCoefs[1] * filter_state[2];
    tmp32 -= kHpPoleCoefs[2] * filter_state[3];
    filter_state[3] = filter_state[2];
    filter_state[2] = (int16_t) (tmp32 >> 14);
    *out_ptr++ = filter_state[2];
  }
}

// All pass filtering of |data_in|, used before splitting the signal into two
// frequency bands (low pass vs high pass).
// Note that |data_in| and |data_out| can NOT correspond to the same address.
//
// - data_in            [i]   : Input audio signal given in Q0.
// - data_length        [i]   : Length of input and output data.
// - filter_coefficient [i]   : Given in Q15.
// - filter_state       [i/o] : State of the filter given in Q(-1).
// - data_out           [o]   : Output audio signal given in Q(-1).
static void AllPassFilter(const int16_t* data_in, size_t data_length,
                          int16_t filter_coefficient, int16_t* filter_state,
                          int16_t* data_out) {
  // The filter can only cause overflow (in the w16 output variable)
  // if more than 4 consecutive input numbers are of maximum value and
  // has the the same sign as the impulse responses first taps.
  // First 6 taps of the impulse response:
  // 0.6399 0.5905 -0.3779 0.2418 -0.1547 0.0990

  size_t i;
  int16_t tmp16 = 0;
  int32_t tmp32 = 0;
  int32_t state32 = ((int32_t) (*filter_state) << 16);  // Q15

  for (i = 0; i < data_length; i++) {
    tmp32 = state32 + filter_coefficient * *data_in;
    tmp16 = (int16_t) (tmp32 >> 16);  // Q(-1)
    *data_out++ = tmp16;
    state32 = (*data_in << 14) - filter_coefficient * tmp16;  // Q14
    state32 <<= 1;  // Q15.
    data_in += 2;
  }

  *filter_state = (int16_t) (state32 >> 16);  // Q(-1)
}

// Splits |data_in| into |hp_data_out| and |lp_data_out| corresponding to
// an upper (high pass) part and a lower (low pass) part respectively.
//
// - data_in      [i]   : Input audio data to be split into two frequency bands.
// - data_length  [i]   : Length of |data_in|.
// - upper_state  [i/o] : State of the upper filter, given in Q(-1).
// - lower_state  [i/o] : State of the lower filter, given in Q(-1).
// - hp_data_out  [o]   : Output audio data of the upper half of the spectrum.
//                        The length is |data_length| / 2.
// - lp_data_out  [o]   : Output audio data of the lower half of the spectrum.
//                        The length is |data_length| / 2.
static void SplitFilter(const int16_t* data_in, size_t data_length,
                        int16_t* upper_state, int16_t* lower_state,
                        int16_t* hp_data_out, int16_t* lp_data_out) {
  size_t i;
  size_t half_length = data_length >> 1;  // Downsampling by 2.
  int16_t tmp_out;

  // All-pass filtering upper branch.
  AllPassFilter(&data_in[0], half_length, kAllPassCoefsQ15[0], upper_state,
                hp_data_out);

  // All-pass filtering lower branch.
  AllPassFilter(&data_in[1], half_length, kAllPassCoefsQ15[1], lower_state,
                lp_data_out);

  // Make LP and HP signals.
  for (i = 0; i < half_length; i++) {
    tmp_out = *hp_data_out;
    *hp_data_out++ -= *lp_data_out;
    *lp_data_out++ += tmp_out;
  }
}

// Calculates the energy of |data_in| in dB, and also updates an overall
// |total_energy| if necessary.
//
// - data_in      [i]   : Input audio data for energy calculation.
// - data_length  [i]   : Length of input data.
// - offset       [i]   : Offset value added to |log_energy|.
// - total_energy [i/o] : An external energy updated with the energy of
//                        |data_in|.
//                        NOTE: |total_energy| is only updated if
//                        |total_energy| <= |kMinEnergy|.
// - log_energy   [o]   : 10 * log10("energy of |data_in|") given in Q4.
static void LogOfEnergy(const int16_t* data_in, size_t data_length,
                        int16_t offset, int16_t* total_energy,
                        int16_t* log_energy) {
  // |tot_rshifts| accumulates the number of right shifts performed on |energy|.
  int tot_rshifts = 0;
  // The |energy| will be normalized to 15 bits. We use unsigned integer because
  // we eventually will mask out the fractional part.
  uint32_t energy = 0;

  assert(data_in != NULL);
  assert(data_length > 0);

  energy = (uint32_t) WebRtcSpl_Energy((int16_t*) data_in, data_length,
                                       &tot_rshifts);

  if (energy != 0) {
    // By construction, normalizing to 15 bits is equivalent with 17 leading
    // zeros of an unsigned 32 bit value.
    int normalizing_rshifts = 17 - WebRtcSpl_NormU32(energy);
    // In a 15 bit representation the leading bit is 2^14. log2(2^14) in Q10 is
    // (14 << 10), which is what we initialize |log2_energy| with. For a more
    // detailed derivations, see below.
    int16_t log2_energy = kLogEnergyIntPart;

    tot_rshifts += normalizing_rshifts;
    // Normalize |energy| to 15 bits.
    // |tot_rshifts| is now the total number of right shifts performed on
    // |energy| after normalization. This means that |energy| is in
    // Q(-tot_rshifts).
    if (normalizing_rshifts < 0) {
      energy <<= -normalizing_rshifts;
    } else {
      energy >>= normalizing_rshifts;
    }

    // Calculate the energy of |data_in| in dB, in Q4.
    //
    // 10 * log10("true energy") in Q4 = 2^4 * 10 * log10("true energy") =
    // 160 * log10(|energy| * 2^|tot_rshifts|) =
    // 160 * log10(2) * log2(|energy| * 2^|tot_rshifts|) =
    // 160 * log10(2) * (log2(|energy|) + log2(2^|tot_rshifts|)) =
    // (160 * log10(2)) * (log2(|energy|) + |tot_rshifts|) =
    // |kLogConst| * (|log2_energy| + |tot_rshifts|)
    //
    // We know by construction that |energy| is normalized to 15 bits. Hence,
    // |energy| = 2^14 + frac_Q15, where frac_Q15 is a fractional part in Q15.
    // Further, we'd like |log2_energy| in Q10
    // log2(|energy|) in Q10 = 2^10 * log2(2^14 + frac_Q15) =
    // 2^10 * log2(2^14 * (1 + frac_Q15 * 2^-14)) =
    // 2^10 * (14 + log2(1 + frac_Q15 * 2^-14)) ~=
    // (14 << 10) + 2^10 * (frac_Q15 * 2^-14) =
    // (14 << 10) + (frac_Q15 * 2^-4) = (14 << 10) + (frac_Q15 >> 4)
    //
    // Note that frac_Q15 = (|energy| & 0x00003FFF)

    // Calculate and add the fractional part to |log2_energy|.
    log2_energy += (int16_t) ((energy & 0x00003FFF) >> 4);

    // |kLogConst| is in Q9, |log2_energy| in Q10 and |tot_rshifts| in Q0.
    // Note that we in our derivation above have accounted for an output in Q4.
    *log_energy = (int16_t)(((kLogConst * log2_energy) >> 19) +
        ((tot_rshifts * kLogConst) >> 9));

    if (*log_energy < 0) {
      *log_energy = 0;
    }
  } else {
    *log_energy = offset;
    return;
  }

  *log_energy += offset;

  // Update the approximate |total_energy| with the energy of |data_in|, if
  // |total_energy| has not exceeded |kMinEnergy|. |total_energy| is used as an
  // energy indicator in WebRtcVad_GmmProbability() in vad_core.c.
  if (*total_energy <= kMinEnergy) {
    if (tot_rshifts >= 0) {
      // We know by construction that the |energy| > |kMinEnergy| in Q0, so add
      // an arbitrary value such that |total_energy| exceeds |kMinEnergy|.
      *total_energy += kMinEnergy + 1;
    } else {
      // By construction |energy| is represented by 15 bits, hence any number of
      // right shifted |energy| will fit in an int16_t. In addition, adding the
      // value to |total_energy| is wrap around safe as long as
      // |kMinEnergy| < 8192.
      *total_energy += (int16_t) (energy >> -tot_rshifts);  // Q0.
    }
  }
}

int16_t WebRtcVad_CalculateFeatures(VadInstT* self, const int16_t* data_in,
                                    size_t data_length, int16_t* features) {
  int16_t total_energy = 0;
  // We expect |data_length| to be 80, 160 or 240 samples, which corresponds to
  // 10, 20 or 30 ms in 8 kHz. Therefore, the intermediate downsampled data will
  // have at most 120 samples after the first split and at most 60 samples after
  // the second split.
  int16_t hp_120[120], lp_120[120];
  int16_t hp_60[60], lp_60[60];
  const size_t half_data_length = data_length >> 1;
  size_t length = half_data_length;  // |data_length| / 2, corresponds to
                                     // bandwidth = 2000 Hz after downsampling.

  // Initialize variables for the first SplitFilter().
  int frequency_band = 0;
  const int16_t* in_ptr = data_in;  // [0 - 4000] Hz.
  int16_t* hp_out_ptr = hp_120;  // [2000 - 4000] Hz.
  int16_t* lp_out_ptr = lp_120;  // [0 - 2000] Hz.

  assert(data_length <= 240);
  assert(4 < kNumChannels - 1);  // Checking maximum |frequency_band|.

  // Split at 2000 Hz and downsample.
  SplitFilter(in_ptr, data_length, &self->upper_state[frequency_band],
              &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);

  // For the upper band (2000 Hz - 4000 Hz) split at 3000 Hz and downsample.
  frequency_band = 1;
  in_ptr = hp_120;  // [2000 - 4000] Hz.
  hp_out_ptr = hp_60;  // [3000 - 4000] Hz.
  lp_out_ptr = lp_60;  // [2000 - 3000] Hz.
  SplitFilter(in_ptr, length, &self->upper_state[frequency_band],
              &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);

  // Energy in 3000 Hz - 4000 Hz.
  length >>= 1;  // |data_length| / 4 <=> bandwidth = 1000 Hz.

  LogOfEnergy(hp_60, length, kOffsetVector[5], &total_energy, &features[5]);

  // Energy in 2000 Hz - 3000 Hz.
  LogOfEnergy(lp_60, length, kOffsetVector[4], &total_energy, &features[4]);

  // For the lower band (0 Hz - 2000 Hz) split at 1000 Hz and downsample.
  frequency_band = 2;
  in_ptr = lp_120;  // [0 - 2000] Hz.
  hp_out_ptr = hp_60;  // [1000 - 2000] Hz.
  lp_out_ptr = lp_60;  // [0 - 1000] Hz.
  length = half_data_length;  // |data_length| / 2 <=> bandwidth = 2000 Hz.
  SplitFilter(in_ptr, length, &self->upper_state[frequency_band],
              &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);

  // Energy in 1000 Hz - 2000 Hz.
  length >>= 1;  // |data_length| / 4 <=> bandwidth = 1000 Hz.
  LogOfEnergy(hp_60, length, kOffsetVector[3], &total_energy, &features[3]);

  // For the lower band (0 Hz - 1000 Hz) split at 500 Hz and downsample.
  frequency_band = 3;
  in_ptr = lp_60;  // [0 - 1000] Hz.
  hp_out_ptr = hp_120;  // [500 - 1000] Hz.
  lp_out_ptr = lp_120;  // [0 - 500] Hz.
  SplitFilter(in_ptr, length, &self->upper_state[frequency_band],
              &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);

  // Energy in 500 Hz - 1000 Hz.
  length >>= 1;  // |data_length| / 8 <=> bandwidth = 500 Hz.
  LogOfEnergy(hp_120, length, kOffsetVector[2], &total_energy, &features[2]);

  // For the lower band (0 Hz - 500 Hz) split at 250 Hz and downsample.
  frequency_band = 4;
  in_ptr = lp_120;  // [0 - 500] Hz.
  hp_out_ptr = hp_60;  // [250 - 500] Hz.
  lp_out_ptr = lp_60;  // [0 - 250] Hz.
  SplitFilter(in_ptr, length, &self->upper_state[frequency_band],
              &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);

  // Energy in 250 Hz - 500 Hz.
  length >>= 1;  // |data_length| / 16 <=> bandwidth = 250 Hz.
  LogOfEnergy(hp_60, length, kOffsetVector[1], &total_energy, &features[1]);

  // Remove 0 Hz - 80 Hz, by high pass filtering the lower band.
  HighPassFilter(lp_60, length, self->hp_filter_state, hp_120);

  // Energy in 80 Hz - 250 Hz.
  LogOfEnergy(hp_120, length, kOffsetVector[0], &total_energy, &features[0]);

  return total_energy;
}